Commit 38576db5 authored by Y yongqiang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into develop

@@ -116,4 +116,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
#flatbuffers
lite/model_parser/flatbuffers/framework_generated.h
build*
# hiai libs
ai_ddk_lib*
@@ -10,3 +10,6 @@
[submodule "third-party/protobuf-host"]
	path = third-party/protobuf-host
	url = https://github.com/protocolbuffers/protobuf.git
[submodule "third-party/flatbuffers"]
path = third-party/flatbuffers
url = https://github.com/google/flatbuffers.git
@@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
@@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_WITH_EXCEPTION "Enable throwing the exception when error occurs in lite" OFF)
lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
@@ -106,7 +108,8 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
-lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON)
+lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." OFF)
lite_option(CUDA_WITH_FP16 "Compile with cuda half support" OFF)
lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
@@ -168,6 +171,7 @@ if(LITE_WITH_RKNPU)
  include(device/rknpu)
endif()
include(external/flatbuffers)
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
@@ -222,6 +226,11 @@ endif()
if(LITE_WITH_MLU)
  include(mlu)
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
  include(device/huawei_ascend_npu)
endif()
include(coveralls)
include(external/mklml) # download mklml package
......
@@ -174,6 +174,10 @@ if (LITE_WITH_MLU)
  add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
  add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
endif()
if (LITE_WITH_PROFILE)
  add_definitions("-DLITE_WITH_PROFILE")
endif()
@@ -190,6 +194,10 @@ if (LITE_WITH_LOG)
  add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_WITH_EXCEPTION)
  add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_TINY_PUBLISH)
  add_definitions("-DLITE_ON_TINY_PUBLISH")
endif()
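A hedged sketch of how a build target might consume the new `LITE_WITH_EXCEPTION` switch defined above; the target name `demo_kernel_lib` is an assumption used only for illustration and is not a target created by this commit, which merely introduces the option and the `-DLITE_WITH_EXCEPTION` definition.

```cmake
# Illustrative only: 'demo_kernel_lib' is an assumed target name.
# Gate exception-related flags on the new option so tiny builds stay lean.
if(LITE_WITH_EXCEPTION)
  target_compile_definitions(demo_kernel_lib PRIVATE LITE_WITH_EXCEPTION)
  target_compile_options(demo_kernel_lib PRIVATE -fexceptions -funwind-tables)
endif()
```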
......
@@ -35,7 +35,11 @@ endif()
if(NOT DEFINED ANDROID_API_LEVEL)
  set(ANDROID_API_LEVEL "23")
  if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
-    set(ANDROID_API_LEVEL "22")
+    if(LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH)
+      set(ANDROID_API_LEVEL "24") # HIAI DDK depends on android-24
+    else()
+      set(ANDROID_API_LEVEL "22")
+    endif()
  endif()
endif()
@@ -76,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
  set(triple arm-v7a-linux-android)
  set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE)
if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
# Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ is linked in the right order
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a")
if (ANDROID_API_LEVEL LESS 21)
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a")
endif()
if(ANDROID_STL_TYPE STREQUAL "c++_shared")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so")
elseif(ANDROID_STL_TYPE STREQUAL "c++_static")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a")
else()
message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.")
endif()
endif()
else()
  message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
......
@@ -23,6 +23,21 @@ if(ANDROID)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")
# Don't re-export libgcc symbols
set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
# Only the libunwind.a from clang(with libc++) provide C++ exception handling support for 32-bit ARM
# Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding
if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
endif()
endif()
if(ARMLINUX)
@@ -59,14 +74,13 @@ function(check_linker_flag)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH))
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables")
else ()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables")
endif()
if (LITE_ON_TINY_PUBLISH)
-  if((NOT LITE_WITH_PYTHON))
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
-  endif()
-  if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
-  endif()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
  check_linker_flag(-Wl,--gc-sections)
endif()
......
@@ -2,6 +2,10 @@ if(NOT LITE_WITH_CUDA)
  return()
endif()
if(CUDA_WITH_FP16)
  add_definitions("-DCUDA_WITH_FP16")
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 53 60 61 62")
@@ -167,6 +171,10 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
  add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
endif()
if (CUDA_WITH_FP16)
  STRING(REGEX REPLACE "30|35|50|52" "" paddle_known_gpu_archs ${paddle_known_gpu_archs})
endif()
include_directories(${CUDA_INCLUDE_DIRS})
if(NOT WITH_DSO)
  if(WIN32)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# 1. path to Huawei Ascend Install Path
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT)
message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON")
endif()
endif()
message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}")
# 2. Huawei Ascend include directory
set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include")
set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include")
set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp")
include_directories(${ACL_INCLUDE_DIR})
include_directories(${ATC_INCLUDE_DIR})
include_directories(${OPP_INCLUDE_DIR})
# 3. find ACL libs (ACL libs should come before ATC libs)
find_library(ACL_ASCENDCL_FILE NAMES ascendcl
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_ASCENDCL_FILE)
message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}")
add_library(acl_ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE})
endif()
# 3.1 ascendcl dependency - libruntime.so
find_library(ACL_RUNTIME_FILE NAMES runtime
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_RUNTIME_FILE)
message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}")
add_library(acl_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE})
endif()
# 4.1 find ATC libs - libregister.so
find_library(ATC_REGISTER_FILE NAMES register
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_REGISTER_FILE)
message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}")
add_library(atc_register SHARED IMPORTED GLOBAL)
set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE})
endif()
# 4.1.1 dependency of register - libprotobuf.so.19,
find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_PROTOBUF_FILE)
message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}")
add_library(atc_protobuf SHARED IMPORTED GLOBAL)
set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE})
endif()
# 4.1.2 dependency of register - libgraph.so
find_library(ATC_GRAPH_FILE NAMES graph
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GRAPH_FILE)
message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}")
add_library(atc_graph SHARED IMPORTED GLOBAL)
set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE})
endif()
# 4.2 find ATC libs - libge_compiler.so
find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMPILER_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}")
add_library(atc_ge_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE})
endif()
# 4.2.1 dependencies of libge_compiler.so - libge_common.so
find_library(ATC_GE_COMMON_FILE NAMES ge_common
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMMON_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}")
add_library(atc_ge_common SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE})
endif()
# 4.2.3 dependencies of libge_compiler.so - libresource.so
find_library(ATC_RESOURCE_FILE NAMES resource
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_RESOURCE_FILE)
message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}")
add_library(atc_resource SHARED IMPORTED GLOBAL)
set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE})
endif()
# 4.3 find OPP libs - libopsproto.so
find_library(OPP_OPS_PROTO_FILE NAMES opsproto
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in
NO_DEFAULT_PATH)
if(NOT OPP_OPS_PROTO_FILE)
message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in")
else()
message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}")
add_library(opp_ops_proto SHARED IMPORTED GLOBAL)
set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE})
endif()
# 4.3.1 dependency of opp_ops_proto - liberror_manager.so
find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_ERROR_MANAGER_FILE)
message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}")
add_library(atc_error_manager SHARED IMPORTED GLOBAL)
set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE})
endif()
# note: huawei_ascend_npu_runtime_libs should come before huawei_ascend_npu_builder_libs
set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs")
set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager
atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs")
\ No newline at end of file
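The two cached lists above are what the rest of the build is expected to consume. A minimal, hypothetical sketch of how a Huawei Ascend NPU bridge target could link them (the target and source names are illustrative, not part of this commit), keeping the runtime libs ahead of the builder libs as the note above requires:

```cmake
# Illustrative only: 'ascend_npu_bridge_demo' and 'demo_bridge.cc' are assumed names.
if(LITE_WITH_HUAWEI_ASCEND_NPU)
  add_library(ascend_npu_bridge_demo STATIC demo_bridge.cc)
  # Runtime (ACL) libs first, then ATC builder libs.
  target_link_libraries(ascend_npu_bridge_demo
                        ${huawei_ascend_npu_runtime_libs}
                        ${huawei_ascend_npu_builder_libs})
endif()
```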
@@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
             PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
             NO_DEFAULT_PATH)
# Added in HiAI DDK 320 or later version
find_library(NPU_DDK_HCL_FILE NAMES hcl
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
if(NOT NPU_DDK_HIAI_FILE)
  message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
@@ -78,5 +83,13 @@ else()
  set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()
-set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
+if(NOT NPU_DDK_HCL_FILE)
+  # message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}")
+else()
+  message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}")
+  add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL)
+  set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE})
+endif()
+set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
@@ -39,7 +39,7 @@ else()
endif()
find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt
-             PATHS ${XPU_SDK_ROOT}/XTDK/shlib
+             PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib ${XPU_SDK_ROOT}/XTDK/shlib # libxpurt.so may have been moved to XTDK/runtime/shlib
             NO_DEFAULT_PATH)
if(NOT XPU_SDK_XPU_RT_FILE)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
INCLUDE(GNUInstallDirs)
SET(LIBDIR "lib")
if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
SET(LIBDIR "lib64")
endif()
SET(FLATBUFFERS_PREFIX_DIR ${THIRD_PARTY_PATH}/flatbuffers)
SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers)
SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers)
SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." FORCE)
IF(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ELSE(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${FLATBUFFERS_INCLUDE_DIR})
if(NOT HOST_CXX_COMPILER)
set(HOST_CXX_COMPILER ${CMAKE_CXX_COMPILER})
set(HOST_C_COMPILER ${CMAKE_C_COMPILER})
endif()
SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${HOST_C_COMPILER}")
ExternalProject_Add(
extern_flatbuffers
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/google/flatbuffers.git"
GIT_TAG "v1.12.0"
SOURCE_DIR ${FLATBUFFERS_SOURCES_DIR}
PREFIX ${FLATBUFFERS_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${FLATBUFFERS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-DFLATBUFFERS_BUILD_TESTS=OFF
${CROSS_COMPILE_CMAKE_ARGS}
${OPTIONAL_ARGS}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib")
add_custom_command(TARGET extern_flatbuffers POST_BUILD
COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib
)
ENDIF()
ENDIF(WIN32)
ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES})
ADD_DEPENDENCIES(flatbuffers extern_flatbuffers)
SET(FLATBUFFERS_FLATC_EXECUTABLE ${FLATBUFFERS_INSTALL_DIR}/bin/flatc)
function(register_generated_output file_name)
get_property(tmp GLOBAL PROPERTY FBS_GENERATED_OUTPUTS)
list(APPEND tmp ${file_name})
set_property(GLOBAL PROPERTY FBS_GENERATED_OUTPUTS ${tmp})
endfunction(register_generated_output)
function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT)
if(FLATBUFFERS_BUILD_LEGACY)
set(OPT ${OPT};--cpp-std c++0x)
else()
# --cpp-std is defined by flatc default settings.
endif()
message(STATUS "`${SRC_FBS}`: add generation of C++ code with '${OPT}'")
get_filename_component(SRC_FBS_DIR ${SRC_FBS} PATH)
message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}")
string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS})
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/${GEN_HEADER}"
COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}"
--cpp --gen-mutable --gen-object-api --reflect-names
${OPT}
-o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}"
DEPENDS flatbuffers
COMMENT "Run generation: '${GEN_HEADER}'")
register_generated_output(${GEN_HEADER})
add_custom_target(${TARGET} ALL DEPENDS ${GEN_HEADER})
endfunction()
set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers")
set(FRAMEWORK_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/framework.fbs")
compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
include_directories(${FLATBUFFERS_INCLUDE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR})
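For orientation, a hedged sketch of how the helper above could be reused for an additional schema; `param.fbs`, `param_fbs_header`, and `model_parser` are assumed names for illustration only and are not files or targets added by this commit:

```cmake
# Illustrative only: compile a second (hypothetical) schema with the same flags
# and make a consumer target wait for the generated header.
set(PARAM_SCHEMA_PATH "lite/model_parser/flatbuffers/param.fbs")  # assumed path
compile_flatbuffers_schema_to_cpp_opt(param_fbs_header ${PARAM_SCHEMA_PATH}
                                      "--no-includes;--gen-compare;--force-empty")
# add_dependencies(model_parser param_fbs_header)  # any target including param_generated.h
```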
@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(deps ${lite_deps_DEPS})
@@ -118,6 +118,12 @@ function (lite_deps TARGET)
    endforeach(var)
  endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
  set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
@@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
  set(options SHARED shared STATIC static MODULE module)
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
      HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -165,6 +171,7 @@ function(lite_cc_library TARGET)
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      )
  if (args_SHARED OR ARGS_shared)
@@ -193,7 +200,7 @@ function(lite_cc_binary TARGET)
    set(options " -g ")
  endif()
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -215,6 +222,7 @@ function(lite_cc_binary TARGET)
      HVY_DEPS ${args_HVY_DEPS}
      CV_DEPS ${CV_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      )
  cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
  if(NOT WIN32)
@@ -246,7 +254,7 @@ function(lite_cc_test TARGET)
  endif()
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
      ARGS
      COMPILE_LEVEL # (basic|extra)
@@ -276,6 +284,7 @@ function(lite_cc_test TARGET)
      HVY_DEPS ${args_HVY_DEPS}
      CV_DEPS ${args_CV_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      )
  _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
  # strip binary target to reduce size
@@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
@@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR)
  file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
-# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
+# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
      ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -438,6 +448,15 @@ function(add_kernel TARGET device level)
    endif()
    set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
  endif()
endif() endif()
if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL") if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL) if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -481,6 +500,7 @@ function(add_kernel TARGET device level) ...@@ -481,6 +500,7 @@ function(add_kernel TARGET device level)
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
@@ -499,7 +519,7 @@ endif()
function(add_operator TARGET level)
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
      ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -537,6 +557,7 @@ function(add_operator TARGET level)
      RKNPU_DEPS ${args_RKNPU_DEPS}
      BM_DEPS ${args_BM_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
      HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
......
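A hedged sketch of how the extended helpers above would typically be called for the new device; the kernel name, source file, and the `${lite_kernel_deps}` variable are assumptions used only to illustrate the HUAWEI_ASCEND_NPU plumbing added here (when LITE_WITH_HUAWEI_ASCEND_NPU is OFF, the sources are routed to the fake-kernels list instead):

```cmake
# Illustrative only: target and source names below are assumed, not real Paddle-Lite targets.
add_kernel(subgraph_compute_huawei_ascend_npu HUAWEI_ASCEND_NPU basic
           SRCS subgraph_compute.cc
           DEPS ${lite_kernel_deps}                       # assumed common kernel deps variable
           HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_runtime_libs})
```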
# C++ Train Demo
-# Introduction
+## Introduction
We all know that Paddle-Lite can run inference on mobile devices; in fact, Paddle-Lite also supports training models on mobile. This article gives an example of training with Paddle-Lite; the task in the example is "Boston house price prediction", also known as "fit-a-line".
You can refer to the following from the book repository:
[documentation](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html)
[source code](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)
@@ -10,18 +12,16 @@
It uses a Linear Regression model for the modeling.
This article mainly describes how to migrate it to Paddle-Lite for training.
-Note: this is a tutorial on training a model with the C++ API; the other APIs do not support training yet.
-# Requirements
+## Requirements
- An Android phone, used to run the training program
-- Python with Paddle (version: 1.7.0) installed
+- Python with Paddle (version >= 1.7.0) installed
-# Quick start
+## Quick start
-## Step1 build paddle-lite
+### Step1 build paddle-lite
-Please follow the tutorial in the [paddle-lite official documentation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) to build a full_publish paddle-lite lib. Taking building on Linux as an example, the commands are:
+Please follow the tutorial in the paddle-lite official documentation to build a full_publish paddle-lite lib. Taking building on Linux as an example, the commands are:
```shell
## set up the environment
@@ -51,7 +51,7 @@ cd Paddle-Lite
Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so
```
-## Step2 build lr_trainer
+### Step2 build lr_trainer
```shell
cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/
@@ -64,7 +64,7 @@ bin/
`-- demo_trainer
```
-## Step3 download model and run it!
+### Step3 download model and run it!
On your laptop, connect to the phone over USB, enable developer mode, and run the following in any directory:
@@ -102,7 +102,7 @@ sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
-# More details
+## More details
The model mentioned above was downloaded directly; if you want to generate it yourself, you can run the following commands:
```shell
@@ -125,9 +125,9 @@ md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
If you want to generate your own model for training, you can refer to the way the model is saved in `train.py`.
-# Checking against the Paddle training results
+## Checking against the Paddle training results
-## The first 10 loss values
+### The first 10 loss values
To verify the consistency between Paddle and Lite, we trained for 10 batches with identical model parameters, identical data, and batch size = 1, and recorded the loss values of both.
@@ -171,11 +171,11 @@ sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
-## Loss curve
+### Loss curve
With the training batch size set to 20 and a global shuffle of the training data in each epoch, the loss curves of Paddle and Lite after 100 epochs compare as follows.
-![lr_loss](image/lr_loss.png)
+![lr_loss](../images/lr_loss.png)
To reproduce the above results, the command for Paddle + Python is:
......
@@ -37,14 +37,25 @@ rm ./lite/api/paddle_use_kernels.h
rm ./lite/api/paddle_use_ops.h
# set build options and start the build
# android-armv7:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
  --arch=armv7 \
  --toolchain=clang \
-  --with_cv=OFF \
  --with_log=OFF \
-  --with_extra=OFF \
+  --with_extra=ON \
  --with_cv=ON \
  --with_opencl=ON
# android-armv8:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
--arch=armv8 \
--toolchain=clang \
--with_log=OFF \
--with_extra=ON \
--with_cv=ON \
--with_opencl=ON
# Note: for build help, run: ./lite/tools/build_android.sh help
```
@@ -206,7 +217,7 @@ adb shell "export GLOG_v=4; \
## 3. How to use it in code
-That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository;
+That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository, which also shows how to check whether the current device supports OpenCL;
Note: the link above points to the latest develop branch online and may differ from your local code; it is recommended to look at your local code under `lite/demo/cxx/` to see how to use it.
......
@@ -86,19 +86,28 @@ config.set_model_from_file(/YOU_MODEL_PATH/mobilenet_v1_opt.nb)
predictor = create_paddle_predictor(config)
```
-(3) Set the input data
+(3) Read data from an image
```python
image = Image.open('./example.jpg')
resized_image = image.resize((224, 224), Image.BILINEAR)
image_data = np.array(resized_image).flatten().tolist()
```
(4) Set the input data
```python
input_tensor = predictor.get_input(0)
input_tensor.resize([1, 3, 224, 224])
-input_tensor.set_float_data([1.] * 3 * 224 * 224)
+input_tensor.set_float_data(image_data)
```
-(4) Run the prediction
+(5) Run the prediction
```python
predictor.run()
```
-(5) Get the output data
+(6) Get the output data
```python
output_tensor = predictor.get_output(0)
print(output_tensor.shape())
......
@@ -59,7 +59,14 @@ Welcome to Paddle-Lite's documentation!
demo_guides/baidu_xpu
demo_guides/rockchip_npu
demo_guides/mediatek_apu
.. toctree::
:maxdepth: 1
:caption: Training example (preview)
:name: sec-train_demo_guides
demo_guides/cpp_train_demo
.. toctree::
:maxdepth: 1
:caption: API documentation
......
@@ -61,7 +61,7 @@ inference_lite_lib.ios64.armv8 iOS prediction library and header files
- How to tailor the prediction library (build only the kernels & ops used by the model, to reduce the library size):
```shell
-./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir
+./lite/tools/build_ios.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir
```
```shell
--with_strip: (OFF|ON); whether to tailor the prediction library according to the input model, default is OFF
......
@@ -21,11 +21,11 @@ pip install paddlelite
- Method 2: download the opt executable
From the [releases page](https://github.com/PaddlePaddle/Paddle-Lite/releases), choose the `opt` conversion tool that matches the version of your prediction library.
-Downloads of the optimization tool are provided here for `release/v2.6` and `release/v2.2.0`
+Downloads of the optimization tool are provided here for `release/v2.6.1` and `release/v2.2.0`
|Version | Linux | MacOS|
|---|---|---|
-| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
+| `release/v2.6.1` | [opt](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt_mac) |
|`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) |
- Method 3: build opt from source
......
@@ -49,4 +49,4 @@ $ ./opt \
## 5. Testing tools
-To help you better understand and use the Lite framework, we provide the [Debug tool](debug#debug) and the [Profile tool](debug#profiler) for users with further needs. The Lite Model Debug Tool can be used to check whether the values of corresponding variables in a model differ between the Lite framework and the PaddlePaddle framework during inference, so that the problematic op can be located quickly and the issue reproduced and investigated. The Profile Monitor Tool helps you understand the time cost of each op: it automatically collects the number of times each op runs and its longest, shortest, and average execution times, providing a baseline reference for performance tuning. See the [related topics](debug) for more.
+To help you better understand and use the Lite framework, we provide the [Debug tool](debug) and the [Profile tool](debug) for users with further needs. The Lite Model Debug Tool can be used to check whether the values of corresponding variables in a model differ between the Lite framework and the PaddlePaddle framework during inference, so that the problematic op can be located quickly and the issue reproduced and investigated. The Profile Monitor Tool helps you understand the time cost of each op: it automatically collects the number of times each op runs and its longest, shortest, and average execution times, providing a baseline reference for performance tuning. See the [related topics](debug) for more.
@@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
...@@ -45,14 +46,17 @@ if (WITH_TESTING) ...@@ -45,14 +46,17 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
  endif()
  if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz")
  endif()
endif()
@@ -242,7 +246,6 @@ if (LITE_WITH_X86)
  add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
  add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
-    COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
    COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
    COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light"
    COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full"
......
@@ -2,7 +2,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG))
  lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else()
  lite_cc_library(place SRCS paddle_place.cc DEPS glog)
-endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+endif()
if (LITE_ON_TINY_PUBLISH)
  set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG")
@@ -11,12 +11,13 @@ endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
-if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
+if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
  #full api dynamic library
  lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
      DEPS paddle_api paddle_api_light paddle_api_full)
-  add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
-  target_link_libraries(paddle_full_api_shared framework_proto)
+  target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files})
+  add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry framework_fbs_header)
+  target_link_libraries(paddle_full_api_shared framework_proto op_registry)
  if(LITE_WITH_X86)
    add_dependencies(paddle_full_api_shared xxhash)
    target_link_libraries(paddle_full_api_shared xxhash)
@@ -39,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
      NPU_DEPS ${npu_kernels}
      APU_DEPS ${apu_kernels}
      RKNPU_DEPS ${rknpu_kernels}
      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
  )
  add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
  if(WIN32)
    target_link_libraries(paddle_light_api_shared shlwapi.lib)
  endif()
-  target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
+  target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels})
  if(APPLE)
    set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
    set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
@@ -70,7 +72,7 @@ else()
    set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto")
  endif()
  set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}")
-  add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
+  add_dependencies(paddle_light_api_shared op_list_h kernel_list_h framework_fbs_header)
  if (LITE_WITH_NPU)
    # Need to add HIAI runtime libs (libhiai.so) dependency
    target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
@@ -93,6 +95,7 @@ if (WITH_TESTING)
      RKNPU_DEPS ${rknpu_kernels}
      BM_DEPS ${bm_kernels}
      MLU_DEPS ${mlu_kernels}
      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
      APU_DEPS ${apu_kernels})
endif()
@@ -111,6 +114,10 @@ if(LITE_WITH_RKNPU)
  set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
endif()
message(STATUS "get ops ${ops}") message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get X86 kernels ${x86_kernels}")
...@@ -125,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}") ...@@ -125,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}")
message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
@@ -143,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
      RKNPU_DEPS ${rknpu_kernels}
      BM_DEPS ${bm_kernels}
      CL_DEPS ${opencl_kernels}
-      FPGA_DEPS ${fpga_kernels})
+      FPGA_DEPS ${fpga_kernels}
+      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
# for light api
@@ -167,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc
      CL_DEPS ${opencl_kernels}
      FPGA_DEPS ${fpga_kernels}
      BM_DEPS ${bm_kernels}
-      MLU_DEPS ${mlu_kernels})
+      MLU_DEPS ${mlu_kernels}
+      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -190,6 +200,7 @@ if(WITH_TESTING)
      FPGA_DEPS ${fpga_kernels}
      BM_DEPS ${bm_kernels}
      MLU_DEPS ${mlu_kernels}
      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
      EXCLUDE_COMPILE_DEPS "ON"
      ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
           --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -321,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
      APU_DEPS ${apu_kernels}
      CL_DEPS ${opencl_kernels}
      FPGA_DEPS ${fpga_kernels}
-      BM_DEPS ${bm_kernels})
+      BM_DEPS ${bm_kernels}
+      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
@@ -361,6 +373,9 @@ endif()
if (LITE_WITH_PYTHON)
  add_subdirectory(python)
  # add library for opt_base
  lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
  add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
endif()
if (LITE_ON_TINY_PUBLISH)
@@ -368,9 +383,6 @@ if (LITE_ON_TINY_PUBLISH)
endif()
-# add library for opt_base
-lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
-add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
  message(STATUS "Compiling opt")
@@ -393,6 +405,7 @@ if(NOT WITH_COVERAGE)
      FPGA_DEPS ${fpga_kernels}
      BM_DEPS ${bm_kernels}
      MLU_DEPS ${mlu_kernels}
      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
      ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
  if (WITH_TESTING)
    add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
@@ -414,7 +427,8 @@ if(NOT IOS)
      RKNPU_DEPS ${rknpu_kernels}
      FPGA_DEPS ${fpga_kernels}
      X86_DEPS ${x86_kernels}
-      CUDA_DEPS ${cuda_kernels})
+      CUDA_DEPS ${cuda_kernels}
+      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
...@@ -429,7 +443,8 @@ if(NOT IOS) ...@@ -429,7 +443,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
...@@ -444,7 +459,8 @@ if(NOT IOS) ...@@ -444,7 +459,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
...@@ -458,7 +474,8 @@ if(NOT IOS) ...@@ -458,7 +474,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
...@@ -469,8 +486,9 @@ if(NOT IOS) ...@@ -469,8 +486,9 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
...@@ -486,7 +504,8 @@ if(NOT IOS) ...@@ -486,7 +504,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif() endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
...@@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
# Unlike static library, module library has to link target to be able to work # Unlike static library, module library has to link target to be able to work
# as a single .so lib. # as a single .so lib.
target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
add_dependencies(paddle_lite_jni framework_fbs_header)
if (LITE_WITH_NPU) if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during # Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
...@@ -31,7 +32,7 @@ else() ...@@ -31,7 +32,7 @@ else()
endif() endif()
set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS}) set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS})
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h) add_dependencies(paddle_lite_jni op_list_h kernel_list_h framework_fbs_header)
if (LITE_WITH_NPU) if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency # Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs}) target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs})
......
...@@ -13,26 +13,31 @@ ...@@ -13,26 +13,31 @@
// limitations under the License. // limitations under the License.
#include "lite/api/cxx_api.h" #include "lite/api/cxx_api.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <set> #include <set>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/api/paddle_use_passes.h" #include "lite/api/paddle_use_passes.h"
#include "lite/utils/io.h" #include "lite/utils/io.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
std::vector<std::string> GetAllOps() {
return OpLiteFactory::Global().GetAllOps();
}
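For reference, a minimal sketch of how the newly exported GetAllOps() helper could be called from application code to dump every operator registered in the library; the wrapper function and the use of std::cout are illustrative only and not part of this patch, the declaration is assumed to come from lite/api/cxx_api.h as added above.
#include <iostream>
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
// Hypothetical helper: prints all operator names known to OpLiteFactory.
void DumpRegisteredOps() {
  const std::vector<std::string> ops = paddle::lite::GetAllOps();
  for (const std::string &op_name : ops) {
    std::cout << op_name << std::endl;
  }
}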
void Predictor::SaveModel(const std::string &dir, void Predictor::SaveModel(const std::string &dir,
lite_api::LiteModelType model_type, lite_api::LiteModelType model_type,
bool record_info) { bool record_info) {
if (!program_) { if (!program_) {
GenRuntimeProgram(); GenRuntimeProgram();
} }
program_->SaveOpInfosToProgram(program_desc_.get()); program_->SaveToProgram(program_desc_);
program_->UpdateVarsOfProgram(program_desc_.get());
switch (model_type) { switch (model_type) {
case lite_api::LiteModelType::kProtobuf: case lite_api::LiteModelType::kProtobuf:
SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true); SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true);
...@@ -52,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir, ...@@ -52,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir,
void Predictor::SaveOpKernelInfo(const std::string &model_dir) { void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info; std::set<std::string> ops_info;
std::set<std::string> kernels_info; std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions(); auto block_size = program_->block_size();
for (auto &node : instructions_) { for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
// parse op type information const auto &insts = program_->instructions(block_idx);
auto op = node.op()->op_info(); for (auto &inst : insts) {
ops_info.insert(op->Type()); // parse op type information
// parse kernel type information auto op = inst.op()->op_info();
std::string kernel_type_str = ops_info.insert(op->Type());
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) + // parse kernel type information
"," + PrecisionRepr(node.kernel()->precision()) + "," + std::string kernel_type_str =
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias(); inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) +
kernels_info.insert(kernel_type_str); "," + PrecisionRepr(inst.kernel()->precision()) + "," +
DataLayoutRepr(inst.kernel()->layout()) + "," +
inst.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
} }
// get source_file name from op type and kernel type // get source_file name from op type and kernel type
...@@ -164,9 +173,9 @@ void Predictor::PrepareFeedFetch() { ...@@ -164,9 +173,9 @@ void Predictor::PrepareFeedFetch() {
std::vector<const cpp::OpDesc *> feeds; std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs; std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions(); const auto &insts = program_->instructions(kRootBlockIdx);
for (size_t i = 0; i < program_->num_instructions(); i++) { for (auto &inst : insts) {
const auto &op = insts[i].op()->op_info(); const auto &op = inst.op()->op_info();
if (op->Type() == "feed") { if (op->Type() == "feed") {
feeds.push_back(op); feeds.push_back(op);
} else if (op->Type() == "fetch") { } else if (op->Type() == "fetch") {
...@@ -249,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config, ...@@ -249,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config,
} else { } else {
LOG(INFO) << "Load model from file."; LOG(INFO) << "Load model from file.";
} }
Build(model_path, Build(model_path,
model_file, model_file,
param_file, param_file,
...@@ -290,10 +298,10 @@ void Predictor::Build(const std::string &model_path, ...@@ -290,10 +298,10 @@ void Predictor::Build(const std::string &model_path,
Build(program_desc_, valid_places, passes); Build(program_desc_, valid_places, passes);
} }
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc, void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &program_desc,
const std::vector<Place> &valid_places, const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) { const std::vector<std::string> &passes) {
program_desc_ = desc; program_desc_ = program_desc;
// `inner_places` is used to optimize passes // `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places; std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) { for (auto &valid_place : valid_places) {
...@@ -326,13 +334,11 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc, ...@@ -326,13 +334,11 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
} }
} }
if (is_quantized_model) { if (is_quantized_model) {
#ifdef LITE_WITH_ARM
inner_places.insert(inner_places.begin(), inner_places.insert(inner_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)}); Place{TARGET(kARM), PRECISION(kInt8)});
#endif
} }
Program program(*desc.get(), scope_, inner_places); Program program(program_desc_, scope_, inner_places);
valid_places_ = inner_places; valid_places_ = inner_places;
core::KernelPickFactor factor; core::KernelPickFactor factor;
......
...@@ -36,6 +36,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = ...@@ -36,6 +36,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
".tailored_kernels_source_list"; ".tailored_kernels_source_list";
static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
std::vector<std::string> GetAllOps();
/* /*
* Predictor for inference, input a model, it will optimize and execute it. * Predictor for inference, input a model, it will optimize and execute it.
*/ */
...@@ -47,18 +49,33 @@ class LITE_API Predictor { ...@@ -47,18 +49,33 @@ class LITE_API Predictor {
program_desc_ = std::make_shared<cpp::ProgramDesc>(); program_desc_ = std::make_shared<cpp::ProgramDesc>();
} }
// Create a predictor with the weight variable scope set. ///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. Create a predictor with the
// given weight variable scope.
///////////////////////////////////////////////////////////////////
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope) explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {} : scope_(root_scope) {}
Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc, ///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. This constructor function can
// only be called in Predictor->Clone. This function will create
// a predictor from an existing ProgramDesc, Scope and RuntimeProgram.
///////////////////////////////////////////////////////////////////
Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root, const std::shared_ptr<Scope>& root,
const std::vector<Place>& valid_places, const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {}) const std::vector<std::string>& var_names = {})
: program_desc_(desc), scope_(root) { : program_desc_(program_desc), scope_(root) {
Program program(*desc.get(), scope_, valid_places, var_names); // step1. Create a Program to construct the exec_scope and ops
optimizer_ = Optimizer(std::move(program), valid_places); Program program(program_desc_, scope_, valid_places, var_names);
exec_scope_ = optimizer_.exec_scope(); exec_scope_ = program.exec_scope();
valid_places_ = valid_places; valid_places_ = valid_places;
// step3. Create the RuntimeProgram.
program_.reset(
new RuntimeProgram(program_desc_, exec_scope_, kRootBlockIdx));
program_generated_ = true;
} }
// Build from a model, with places set for hardware config. // Build from a model, with places set for hardware config.
...@@ -77,32 +94,62 @@ class LITE_API Predictor { ...@@ -77,32 +94,62 @@ class LITE_API Predictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool memory_from_memory = false); bool memory_from_memory = false);
void Build(const std::shared_ptr<cpp::ProgramDesc>& desc, void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<Place>& valid_places, const std::vector<Place>& valid_places,
const std::vector<std::string>& passes = {}); const std::vector<std::string>& passes = {});
std::shared_ptr<Predictor> Clone() const { //////////////////////////////////////////////////////////
// Function: Clone
// Usage: Create a Predictor from an existing one,
// the cloned predictor will share persistable variables
// in scope_ with the original predictor.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone() {
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from current program_desc_ and
// runtime_program.
auto predictor = auto predictor =
std::make_shared<Predictor>(program_desc_, scope_, valid_places_); std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
// step3. Return the result
return predictor; return predictor;
} }
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone( // Function: Clone(var_names)
const std::vector<std::string>& var_names) const { // Usage: Create a Predictor from an existing one,
// the cloned predictor will share persistable variables
// but persistable variables named in var_names will not
// be shared.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone(const std::vector<std::string>& var_names) {
CHECK(program_desc_) << "Both program and scope of current predictor " CHECK(program_desc_) << "Both program and scope of current predictor "
"should not be nullptr in Clone mode."; "should not be nullptr in Clone mode.";
CHECK(scope_) << "Both program and scope of current predictor should " CHECK(scope_) << "Both program and scope of current predictor should "
"not be nullptr in Clone mode."; "not be nullptr in Clone mode.";
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from current program_desc_ and
// runtime_program.
auto predictor = std::make_shared<Predictor>( auto predictor = std::make_shared<Predictor>(
program_desc_, scope_, valid_places_, var_names); program_desc_, scope_, valid_places_, var_names);
// step3. Copy some persistable variables into private scope.
for (auto i : var_names) { for (auto var_name : var_names) {
predictor->exec_scope_->LocalVar(i); predictor->exec_scope_->LocalVar(var_name);
auto* tensor = predictor->scope_->Var(i)->GetMutable<lite::Tensor>(); auto* tensor =
predictor->scope_->Var(var_name)->GetMutable<lite::Tensor>();
auto* sub_tensor = auto* sub_tensor =
predictor->exec_scope_->Var(i)->GetMutable<lite::Tensor>(); predictor->exec_scope_->Var(var_name)->GetMutable<Tensor>();
sub_tensor->CopyDataFrom(*tensor); sub_tensor->CopyDataFrom(*tensor);
} }
// step4. Return the result
return predictor; return predictor;
} }
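To illustrate the cloning behaviour documented above, a rough usage sketch; it assumes 'predictor' is an already-built paddle::lite::Predictor, and the variable name "fc_0.w_0" is purely a placeholder, not taken from the patch.
#include <memory>
#include "lite/api/cxx_api.h"
// Hypothetical sketch of the two Clone overloads.
void CloneExample(const std::shared_ptr<paddle::lite::Predictor>& predictor) {
  // Shares every persistable variable with the original predictor.
  std::shared_ptr<paddle::lite::Predictor> shared_clone = predictor->Clone();
  // Variables listed by name get a private copy in the clone's exec scope
  // instead of being shared, so the clone may overwrite them safely.
  std::shared_ptr<paddle::lite::Predictor> private_clone =
      predictor->Clone({"fc_0.w_0"});
}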
...@@ -138,6 +185,7 @@ class LITE_API Predictor { ...@@ -138,6 +185,7 @@ class LITE_API Predictor {
// get a const tensor according to its name // get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const; const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const; const RuntimeProgram& runtime_program() const;
Scope* scope() { return scope_.get(); }
// This method is disabled in mobile, for unnecessary dependencies required. // This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel( void SaveModel(
...@@ -160,7 +208,7 @@ class LITE_API Predictor { ...@@ -160,7 +208,7 @@ class LITE_API Predictor {
std::shared_ptr<cpp::ProgramDesc> program_desc_; std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::shared_ptr<Scope> scope_; std::shared_ptr<Scope> scope_;
Scope* exec_scope_; Scope* exec_scope_;
std::unique_ptr<RuntimeProgram> program_; std::shared_ptr<RuntimeProgram> program_;
bool program_generated_{false}; bool program_generated_{false};
std::vector<std::string> input_names_; std::vector<std::string> input_names_;
std::vector<std::string> output_names_; std::vector<std::string> output_names_;
......
...@@ -53,12 +53,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { ...@@ -53,12 +53,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
#endif #endif
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init(); Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(), lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(), config.mlu_core_number(),
config.mlu_use_first_conv(), config.mlu_input_layout(),
config.mlu_first_conv_mean(), config.mlu_firstconv_param());
config.mlu_first_conv_std(),
config.mlu_input_layout());
#endif // LITE_WITH_MLU #endif // LITE_WITH_MLU
auto use_layout_preprocess_pass = auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS"); config.model_dir().find("OPENCL_PRE_PRECESS");
...@@ -75,6 +73,18 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { ...@@ -75,6 +73,18 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
} }
mode_ = config.power_mode(); mode_ = config.power_mode();
threads_ = config.threads(); threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
int num_threads = config.x86_math_library_num_threads(); int num_threads = config.x86_math_library_num_threads();
......
...@@ -15,8 +15,6 @@ ...@@ -15,8 +15,6 @@
#include "lite/api/light_api.h" #include "lite/api/light_api.h"
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -24,17 +22,18 @@ namespace lite { ...@@ -24,17 +22,18 @@ namespace lite {
void LightPredictor::Build(const std::string& lite_model_file, void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) { bool model_from_memory) {
if (model_from_memory) { if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_); LoadModelNaiveFromMemory(
lite_model_file, scope_.get(), program_desc_.get());
} else { } else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get());
} }
// For weight quantization of post training, load the int8/16 weights // For weight quantization of post training, load the int8/16 weights
// for optimized model, and dequant it to fp32. // for optimized model, and dequant it to fp32.
DequantizeWeight(); DequantizeWeight();
BuildRuntimeProgram(program_desc_);
BuildRuntimeProgram(cpp_program_desc_);
PrepareFeedFetch(); PrepareFeedFetch();
program_desc_.reset();
} }
void LightPredictor::Build(const std::string& model_dir, void LightPredictor::Build(const std::string& model_dir,
...@@ -45,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir, ...@@ -45,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir,
switch (model_type) { switch (model_type) {
#ifndef LITE_ON_TINY_PUBLISH #ifndef LITE_ON_TINY_PUBLISH
case lite_api::LiteModelType::kProtobuf: case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_); LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break; break;
#endif #endif
case lite_api::LiteModelType::kNaiveBuffer: { case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) { if (model_from_memory) {
LoadModelNaiveFromMemory( LoadModelNaiveFromMemory(
model_buffer, param_buffer, scope_.get(), &cpp_program_desc_); model_buffer, param_buffer, scope_.get(), program_desc_.get());
} else { } else {
LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_); LoadModelNaive(model_dir, scope_.get(), program_desc_.get());
} }
break; break;
} }
...@@ -62,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir, ...@@ -62,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir,
} }
DequantizeWeight(); DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_); BuildRuntimeProgram(program_desc_);
PrepareFeedFetch(); PrepareFeedFetch();
} }
...@@ -111,15 +110,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() { ...@@ -111,15 +110,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() {
} }
// append the names of inputs and outputs into input_names_ and output_names_ // append the names of inputs and outputs into input_names_ and output_names_
void LightPredictor::PrepareFeedFetch() { void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0); std::vector<const cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> feeds; std::vector<const cpp::OpDesc*> fetchs;
std::vector<cpp::OpDesc*> fetchs; std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
for (size_t i = 0; i < current_block->OpsSize(); i++) { auto main_block = program_desc->GetBlock<cpp::BlockDesc>(kRootBlockIdx);
auto op = current_block->GetOp<cpp::OpDesc>(i); auto op_size = main_block->OpsSize();
if (op->Type() == "feed") { for (size_t op_idx = 0; op_idx < op_size; ++op_idx) {
feeds.push_back(op); auto op_desc = main_block->GetOp<cpp::OpDesc>(op_idx);
} else if (op->Type() == "fetch") { if (op_desc->Type() == "feed") {
fetchs.push_back(op); feeds.push_back(op_desc);
} else if (op_desc->Type() == "fetch") {
fetchs.push_back(op_desc);
} }
} }
input_names_.resize(feeds.size()); input_names_.resize(feeds.size());
...@@ -134,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -134,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() {
} }
} }
void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { void LightPredictor::BuildRuntimeProgram(
std::vector<Instruction> insts; const std::shared_ptr<const cpp::ProgramDesc>& program_desc) {
// 1. Create op first auto* exe_scope = &scope_->NewScope();
Program program(prog, scope_, {}); // Prepare workspace
scope_->Var("feed")->GetMutable<std::vector<lite::Tensor>>();
// 2. Create Instructs scope_->Var("fetch")->GetMutable<std::vector<lite::Tensor>>();
#ifdef LITE_WITH_OPENCL CHECK(program_desc);
using OpenCLContext = Context<TargetType::kOpenCL>; auto block_size = program_desc->BlocksSize();
std::unique_ptr<KernelContext> local_ctx(new KernelContext()); CHECK(block_size);
local_ctx->As<OpenCLContext>().InitOnce(); for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
#endif auto block_desc = program_desc->GetBlock<cpp::BlockDesc>(block_idx);
auto var_size = block_desc->VarsSize();
// Create the kernels of the target places, and filter out the specific for (size_t var_idx = 0; var_idx < var_size; ++var_idx) {
// kernel with the target alias. auto var_desc = block_desc->GetVar<cpp::VarDesc>(var_idx);
for (auto& op : program.ops()) { if (!var_desc->Persistable()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr); exe_scope->Var(var_desc->Name());
std::string op_type, alias; } else {
Place place; if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); scope_->Var(var_desc->Name());
auto kernels = op->CreateKernels({place}); }
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
} }
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
} }
program_.reset(new RuntimeProgram(std::move(insts))); // Only extract the ops and generate the runtime program from the main
// block desc
CHECK(program.exec_scope()); program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx));
program_->set_exec_scope(program.exec_scope());
} }
void LightPredictor::DequantizeWeight() { void LightPredictor::DequantizeWeight() {
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
#define PROCESS_CONV2D_DATA() \ #define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \ for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \ for (int64_t j = 0; j < offset; ++j) { \
...@@ -207,10 +189,9 @@ void LightPredictor::DequantizeWeight() { ...@@ -207,10 +189,9 @@ void LightPredictor::DequantizeWeight() {
} }
return result; return result;
}; };
Tensor tmp_tensor; Tensor tmp_tensor;
for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { for (size_t i = 0; i < program_desc->BlocksSize(); i++) {
auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i); auto* block = program_desc->GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) { for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k); auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) { if (is_weight_quantized_op(op_desc)) {
......
...@@ -46,6 +46,7 @@ class LITE_API LightPredictor { ...@@ -46,6 +46,7 @@ class LITE_API LightPredictor {
LightPredictor(const std::string& lite_model_file, LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) { bool model_from_memory = false) {
scope_ = std::make_shared<Scope>(); scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(lite_model_file, model_from_memory); Build(lite_model_file, model_from_memory);
} }
...@@ -57,6 +58,7 @@ class LITE_API LightPredictor { ...@@ -57,6 +58,7 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) { lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>(); scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory); Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
} }
...@@ -78,6 +80,7 @@ class LITE_API LightPredictor { ...@@ -78,6 +80,7 @@ class LITE_API LightPredictor {
std::vector<std::string> GetInputNames(); std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames(); std::vector<std::string> GetOutputNames();
void PrepareFeedFetch(); void PrepareFeedFetch();
Scope* scope() { return scope_.get(); }
private: private:
void Build(const std::string& lite_model_file, void Build(const std::string& lite_model_file,
...@@ -91,14 +94,15 @@ class LITE_API LightPredictor { ...@@ -91,14 +94,15 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool model_from_memory = false); bool model_from_memory = false);
void BuildRuntimeProgram(const cpp::ProgramDesc& prog); void BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc);
void DequantizeWeight(); void DequantizeWeight();
private: private:
std::shared_ptr<Scope> scope_; std::shared_ptr<Scope> scope_;
std::unique_ptr<RuntimeProgram> program_; std::unique_ptr<RuntimeProgram> program_;
cpp::ProgramDesc cpp_program_desc_; std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::vector<std::string> input_names_; std::vector<std::string> input_names_;
std::vector<std::string> output_names_; std::vector<std::string> output_names_;
}; };
......
...@@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) { ...@@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
threads_ = config.threads(); threads_ = config.threads();
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir( Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir()); config.subgraph_model_cache_dir());
#endif #endif
} }
......
...@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000); ASSERT_EQ(out->dims().production(), 1000);
double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1; double eps = first_target == TARGET(kOpenCL) ? 0.25 : 0.1;
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) { for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)]; auto result = pdata[j * step + (out->dims()[1] * i)];
......
...@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() { ...@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)}); valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") { } else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU)); valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU)); valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") { } else if (target_repr == "mlu") {
...@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kXPU", "kXPU",
"kRKNPU", "kRKNPU",
"kAPU", "kAPU",
"kHuaweiAscendNPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
int maximum_optype_length = 0; int maximum_optype_length = 0;
...@@ -265,16 +268,17 @@ void PrintHelpInfo() { ...@@ -265,16 +268,17 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n" " `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n" " `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " "
"`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n" " Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of " " `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `--print_supported_ops=true " " `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display valid operators of input targets\n" " Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> " " `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display operators in the input model\n"; " Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl; << help_info << std::endl;
......
...@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kX86)); valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") { } else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU)); valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places_.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU)); valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") { } else if (target_repr == "rknpu") {
...@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() { ...@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() {
" `set_model_type(protobuf|naive_buffer)`: naive_buffer by " " `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
"default\n" "default\n"
" `set_lite_out(output_optimize_model_dir)`\n" " `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " "
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `record_model_info(false|true)`: refer to whether to record ops " " `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n" "info for striping lib, false by default`\n"
" `run() : start model transformation`\n" " `run() : start model transformation`\n"
...@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() { ...@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() {
" `--param_file=<param_path>`\n" " `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n" " `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n" " Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of " " `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `--print_supported_ops=true " " `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display valid operators of input targets\n" " Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> " " `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display operators in the input model\n"; " Display operators in the input model\n";
std::cout << "paddlelite opt version:" << opt_version << std::endl std::cout << "paddlelite opt version:" << opt_version << std::endl
<< help_info << std::endl; << help_info << std::endl;
...@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { ...@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kXPU", "kXPU",
"kRKNPU", "kRKNPU",
"kAPU", "kAPU",
"kHuaweiAscendNPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
// Get the lengh of the first column: maximum length of the op_type // Get the lengh of the first column: maximum length of the op_type
......
...@@ -13,6 +13,9 @@ ...@@ -13,6 +13,9 @@
// limitations under the License. // limitations under the License.
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include <utility>
#include "lite/core/context.h" #include "lite/core/context.h"
#include "lite/core/device_info.h" #include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h" #include "lite/core/target_wrapper.h"
...@@ -21,10 +24,30 @@ ...@@ -21,10 +24,30 @@
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/target_wrapper.h" #include "lite/backends/cuda/target_wrapper.h"
#endif #endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/target_wrapper.h"
#endif
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_runtime.h"
#endif
namespace paddle { namespace paddle {
namespace lite_api { namespace lite_api {
bool IsOpenCLBackendValid() {
bool opencl_valid = false;
#ifdef LITE_WITH_OPENCL
opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice();
#endif
LOG(INFO) << "opencl_valid:" << opencl_valid;
return opencl_valid;
}
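A hedged sketch of how a caller might use the new IsOpenCLBackendValid() check to fall back to a CPU model when the device cannot run OpenCL; the model file names are placeholders, and MobileConfig, set_model_from_file and CreatePaddlePredictor are assumed to be the usual light-API entry points rather than anything introduced by this patch.
#include <memory>
#include "lite/api/paddle_api.h"
// Hypothetical backend selection at application start-up; paths are placeholders.
std::shared_ptr<paddle::lite_api::PaddlePredictor> CreatePredictorWithFallback() {
  paddle::lite_api::MobileConfig config;
  if (paddle::lite_api::IsOpenCLBackendValid()) {
    config.set_model_from_file("model_opencl.nb");  // OpenCL-optimized model
  } else {
    config.set_model_from_file("model_arm.nb");     // plain ARM CPU model
  }
  return paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(
      config);
}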
Tensor::Tensor(void *raw) : raw_tensor_(raw) {} Tensor::Tensor(void *raw) : raw_tensor_(raw) {}
// TODO(Superjomn) refine this by using another `const void* const_raw`; // TODO(Superjomn) refine this by using another `const void* const_raw`;
...@@ -97,6 +120,13 @@ void Tensor::CopyFromCpu(const T *src_data) { ...@@ -97,6 +120,13 @@ void Tensor::CopyFromCpu(const T *src_data) {
data, src_data, num * sizeof(T), lite::IoDirection::HtoD); data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else #else
LOG(FATAL) << "Please compile the lib with CUDA."; LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif #endif
} else { } else {
LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA"; LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA";
...@@ -117,6 +147,13 @@ void Tensor::CopyToCpu(T *data) const { ...@@ -117,6 +147,13 @@ void Tensor::CopyToCpu(T *data) const {
data, src_data, num * sizeof(T), lite::IoDirection::DtoH); data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else #else
LOG(FATAL) << "Please compile the lib with CUDA."; LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif #endif
} else { } else {
LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA"; LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA";
...@@ -138,6 +175,11 @@ template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *); ...@@ -138,6 +175,11 @@ template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *); template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *); template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kMLU>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kMLU>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kMLU>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kMLU>(const int8_t *);
template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(float *) const;
template void Tensor::CopyToCpu(int *) const; template void Tensor::CopyToCpu(int *) const;
template void Tensor::CopyToCpu(int8_t *) const; template void Tensor::CopyToCpu(int8_t *) const;
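Given the new kMLU branches above, a minimal sketch of feeding host data to an MLU input tensor; the shape and values are placeholders, and 'input' is assumed to be the Tensor handle returned by a predictor built with MLU places.
#include <vector>
#include "lite/api/paddle_api.h"
// Hypothetical host-to-device copy for an MLU input; sizes are made up.
void FeedMluInput(paddle::lite_api::Tensor* input) {
  std::vector<float> host_input(1 * 3 * 224 * 224, 0.f);
  input->Resize({1, 3, 224, 224});
  input->CopyFromCpu<float, paddle::lite_api::TargetType::kMLU>(
      host_input.data());
}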
...@@ -228,13 +270,9 @@ void CxxConfig::set_mlu_core_number(int core_number) { ...@@ -228,13 +270,9 @@ void CxxConfig::set_mlu_core_number(int core_number) {
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout; mlu_input_layout_ = layout;
} }
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { void CxxConfig::set_mlu_firstconv_param(const std::vector<float> &mean,
mlu_use_first_conv_ = use_first_conv; const std::vector<float> &std) {
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
mlu_first_conv_mean_ = mean; mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std; mlu_first_conv_std_ = std;
} }
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
...@@ -242,18 +280,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { ...@@ -242,18 +280,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
} }
int CxxConfig::mlu_core_number() const { return mlu_core_number_; } int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } std::pair<std::vector<float>, std::vector<float>>
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const { CxxConfig::mlu_firstconv_param() const {
return mlu_first_conv_mean_; return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_);
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
} }
#endif #endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size); lite::TargetWrapperXPU::workspace_l3_size_per_thread = l3_size;
#else #else
LOG(WARNING) << "The invoking of the function " LOG(WARNING) << "The invoking of the function "
"'set_xpu_workspace_l3_size_per_thread' is ignored, please " "'set_xpu_workspace_l3_size_per_thread' is ignored, please "
...@@ -263,7 +298,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { ...@@ -263,7 +298,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
void CxxConfig::set_xpu_dev_per_thread(int dev_no) { void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetDev(dev_no); lite::TargetWrapperXPU::SetDev(dev_no);
#else #else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON."; "ignored, please rebuild it with LITE_WITH_XPU=ON.";
...@@ -272,7 +307,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) { ...@@ -272,7 +307,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) { void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::_multi_encoder_precision = precision; lite::TargetWrapperXPU::multi_encoder_precision = precision;
#else #else
LOG(WARNING) << "The invoking of the function " LOG(WARNING) << "The invoking of the function "
"'set_xpu_multi_encoder_precision' is " "'set_xpu_multi_encoder_precision' is "
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#define PADDLE_LITE_API_H_ #define PADDLE_LITE_API_H_
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "paddle_place.h" // NOLINT #include "paddle_place.h" // NOLINT
...@@ -32,6 +33,9 @@ using lod_t = std::vector<std::vector<uint64_t>>; ...@@ -32,6 +33,9 @@ using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK }; enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
// return true if the current device can run models with the OpenCL backend
LITE_API bool IsOpenCLBackendValid();
struct LITE_API Tensor { struct LITE_API Tensor {
explicit Tensor(void* raw); explicit Tensor(void* raw);
explicit Tensor(const void* raw); explicit Tensor(const void* raw);
...@@ -122,6 +126,7 @@ class LITE_API ConfigBase { ...@@ -122,6 +126,7 @@ class LITE_API ConfigBase {
PowerMode mode_{LITE_POWER_NO_BIND}; PowerMode mode_{LITE_POWER_NO_BIND};
// to save subgraph model for npu/xpu/... // to save subgraph model for npu/xpu/...
std::string subgraph_model_cache_dir_{""}; std::string subgraph_model_cache_dir_{""};
int device_id_{0};
public: public:
explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1); explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
...@@ -141,6 +146,9 @@ class LITE_API ConfigBase { ...@@ -141,6 +146,9 @@ class LITE_API ConfigBase {
const std::string& subgraph_model_cache_dir() const { const std::string& subgraph_model_cache_dir() const {
return subgraph_model_cache_dir_; return subgraph_model_cache_dir_;
} }
// set Device ID
void set_device_id(int device_id) { device_id_ = device_id; }
const int get_device_id() const { return device_id_; }
}; };
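As a rough illustration of the new device-id plumbing, the sketch below shows how a CxxConfig might be set up for the Huawei Ascend NPU target added in this patch; the model path, cache directory and device number are placeholders, and set_model_dir/set_valid_places are assumed to behave as in the existing full API.
#include "lite/api/paddle_api.h"
// Hypothetical Huawei Ascend NPU configuration.
paddle::lite_api::CxxConfig MakeAscendConfig() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // placeholder model directory
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}});
  config.set_device_id(0);  // forwarded to SetHuaweiAscendDeviceID() in Init()
  config.set_subgraph_model_cache_dir("/data/local/tmp/subgraph_cache");
  return config;
}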
/// CxxConfig is the config for the Full feature predictor. /// CxxConfig is the config for the Full feature predictor.
...@@ -160,9 +168,8 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -160,9 +168,8 @@ class LITE_API CxxConfig : public ConfigBase {
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1}; int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false}; std::vector<float> mlu_first_conv_mean_{};
std::vector<float> mlu_first_conv_mean_; std::vector<float> mlu_first_conv_std_{};
std::vector<float> mlu_first_conv_std_;
#endif #endif
public: public:
...@@ -210,24 +217,22 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -210,24 +217,22 @@ class LITE_API CxxConfig : public ConfigBase {
void set_mlu_core_version(lite_api::MLUCoreVersion core_version); void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels // set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number); void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether to use MLU's first conv kernel. First conv is a special kernel // whether to use MLU's first conv kernel. First conv is a special kernel
// provided by MLU, its input is uint8, and also needs two 3-dimensional // provided by MLU, its input is uint8, and also needs two 3-dimensional
// vectors which save all inputs' mean and std values // vectors which save all inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv); // set the 3-dimensional mean vector and 3-dimensional std vector used by
// set the 3-dimensional mean vector used by MLU's first conv // MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean); void set_mlu_firstconv_param(const std::vector<float>& mean,
// set the 3-dimensional std vector used by MLU's first conv const std::vector<float>& std);
void set_mlu_first_conv_std(const std::vector<float>& std); // set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
lite_api::MLUCoreVersion mlu_core_version() const; lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const; int mlu_core_number() const;
DataLayoutType mlu_input_layout() const; DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const; // std::pair<mean, std>
const std::vector<float>& mlu_first_conv_mean() const; std::pair<std::vector<float>, std::vector<float>> mlu_firstconv_param() const;
const std::vector<float>& mlu_first_conv_std() const;
#endif #endif
// XPU only, set the size of the workspace memory from L3 cache for the // XPU only, set the size of the workspace memory from L3 cache for the
......
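Tying the MLU changes together, a hedged example of the consolidated first-conv configuration, where one set_mlu_firstconv_param call replaces the former use/mean/std setters; it assumes a build with LITE_WITH_MLU=ON, and the core counts and ImageNet-style mean/std values are placeholders.
#include "lite/api/paddle_api.h"
// Hypothetical MLU configuration using the merged first-conv setter.
void ConfigureMlu(paddle::lite_api::CxxConfig* config) {
  config->set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config->set_mlu_core_number(4);
  config->set_mlu_input_layout(DATALAYOUT(kNHWC));
  // One call now sets both the per-channel mean and std used by the uint8
  // first-conv kernel.
  config->set_mlu_firstconv_param({0.485f, 0.456f, 0.406f},
                                  {0.229f, 0.224f, 0.225f});
}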
...@@ -15,8 +15,11 @@ ...@@ -15,8 +15,11 @@
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#include "lite/utils/io.h" #include "lite/utils/io.h"
DEFINE_string(model_dir, "", ""); DEFINE_string(model_dir, "", "");
namespace paddle { namespace paddle {
......
...@@ -54,7 +54,8 @@ const std::string& ActivationTypeToStr(ActivationType act) { ...@@ -54,7 +54,8 @@ const std::string& ActivationTypeToStr(ActivationType act) {
"Sigmoid", "Sigmoid",
"Tanh", "Tanh",
"Swish", "Swish",
"Exp"}; "Exp",
"ThresholdedRelu"};
auto x = static_cast<int>(act); auto x = static_cast<int>(act);
CHECK_LT(x, static_cast<int>(ActivationType::NUM)); CHECK_LT(x, static_cast<int>(ActivationType::NUM));
return act2string[x]; return act2string[x];
...@@ -74,7 +75,8 @@ const std::string& TargetToStr(TargetType target) { ...@@ -74,7 +75,8 @@ const std::string& TargetToStr(TargetType target) {
"bm", "bm",
"mlu", "mlu",
"rknpu", "rknpu",
"apu"}; "apu",
"huawei_ascend_npu"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -117,7 +119,8 @@ const std::string& TargetRepr(TargetType target) { ...@@ -117,7 +119,8 @@ const std::string& TargetRepr(TargetType target) {
"kBM", "kBM",
"kMLU", "kMLU",
"kRKNPU", "kRKNPU",
"kAPU"}; "kAPU",
"kHuaweiAscendNPU"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -162,7 +165,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) { ...@@ -162,7 +165,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kMLU), TARGET(kMLU),
TARGET(kAPU), TARGET(kAPU),
TARGET(kRKNPU), TARGET(kRKNPU),
TARGET(kFPGA)}); TARGET(kFPGA),
TARGET(kHuaweiAscendNPU)});
if (target == TARGET(kAny)) { if (target == TARGET(kAny)) {
return valid_set; return valid_set;
} }
......
...@@ -57,7 +57,8 @@ enum class TargetType : int { ...@@ -57,7 +57,8 @@ enum class TargetType : int {
kMLU = 11, kMLU = 11,
kRKNPU = 12, kRKNPU = 12,
kAPU = 13, kAPU = 13,
NUM = 14, // number of fields. kHuaweiAscendNPU = 14,
NUM = 15, // number of fields.
}; };
enum class PrecisionType : int { enum class PrecisionType : int {
kUnk = 0, kUnk = 0,
...@@ -106,7 +107,8 @@ enum class ActivationType : int { ...@@ -106,7 +107,8 @@ enum class ActivationType : int {
kAbs = 9, kAbs = 9,
kHardSwish = 10, kHardSwish = 10,
kReciprocal = 11, kReciprocal = 11,
NUM = 12, kThresholdedRelu = 12,
NUM = 13,
}; };
static size_t PrecisionTypeLength(PrecisionType type) { static size_t PrecisionTypeLength(PrecisionType type) {
......
...@@ -26,7 +26,9 @@ USE_MIR_PASS(argument_type_display_pass); ...@@ -26,7 +26,9 @@ USE_MIR_PASS(argument_type_display_pass);
USE_MIR_PASS(runtime_context_assign_pass); USE_MIR_PASS(runtime_context_assign_pass);
USE_MIR_PASS(graph_visualize_pass); USE_MIR_PASS(graph_visualize_pass);
USE_MIR_PASS(remove_tf_redundant_ops_pass);
USE_MIR_PASS(lite_conv_bn_fuse_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass);
USE_MIR_PASS(lite_conv_conv_fuse_pass);
USE_MIR_PASS(lite_fc_fuse_pass); USE_MIR_PASS(lite_fc_fuse_pass);
USE_MIR_PASS(lite_shuffle_channel_fuse_pass); USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
...@@ -46,14 +48,18 @@ USE_MIR_PASS(memory_optimize_pass); ...@@ -46,14 +48,18 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
USE_MIR_PASS(lite_scale_activation_fuse_pass); USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass); USE_MIR_PASS(__xpu__fc_fuse_pass);
USE_MIR_PASS(__xpu__mmdnn_fuse_pass);
...@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) { ...@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) {
.value("MLU", TargetType::kMLU) .value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU) .value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU) .value("APU", TargetType::kAPU)
.value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
.value("Any", TargetType::kAny); .value("Any", TargetType::kAny);
// PrecisionType // PrecisionType
......
...@@ -59,9 +59,9 @@ void TestModel(const std::vector<Place>& valid_places) { ...@@ -59,9 +59,9 @@ void TestModel(const std::vector<Place>& valid_places) {
} }
auto* image_tensor = predictor.GetInput(1); auto* image_tensor = predictor.GetInput(1);
image_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 2}))); image_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 2})));
data = image_tensor->mutable_data<float>(); auto* data_1 = image_tensor->mutable_data<int>();
data[0] = FLAGS_im_height; data_1[0] = FLAGS_im_height;
data[1] = FLAGS_im_width; data_1[1] = FLAGS_im_width;
for (int i = 0; i < FLAGS_warmup; ++i) { for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run(); predictor.Run();
......
...@@ -10,3 +10,4 @@ add_subdirectory(mlu) ...@@ -10,3 +10,4 @@ add_subdirectory(mlu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(apu) add_subdirectory(apu)
add_subdirectory(rknpu) add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
...@@ -83,6 +83,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR) ...@@ -83,6 +83,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
conv5x5s2_depthwise_int8.cc conv5x5s2_depthwise_int8.cc
conv5x5s2_depthwise_fp32.cc conv5x5s2_depthwise_fp32.cc
conv3x3_winograd_fp32_c4.cc conv3x3_winograd_fp32_c4.cc
conv3x3_winograd_int8.cc
conv_winograd_3x3.cc conv_winograd_3x3.cc
conv_impl.cc conv_impl.cc
softmax.cc softmax.cc
...@@ -126,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) ...@@ -126,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
split_merge_lod_tenosr.cc split_merge_lod_tenosr.cc
reduce_prod.cc reduce_prod.cc
lstm.cc lstm.cc
clip.cc
DEPS ${lite_kernel_deps} context tensor) DEPS ${lite_kernel_deps} context tensor)
endif() endif()
...@@ -753,23 +753,15 @@ void act_abs<float>(const float* din, float* dout, int size, int threads) { ...@@ -753,23 +753,15 @@ void act_abs<float>(const float* din, float* dout, int size, int threads) {
} }
} }
#ifdef LITE_WITH_TRAIN
template <> template <>
void act_square_grad(const float* din, void act_thresholded_relu<float>(
const float* dout_grad, const float* din, float* dout, int size, float threshold, int threads) {
float* din_grad,
int size,
int threads) {
const float* ptr_out_grad = dout_grad;
float* ptr_in_grad = din_grad;
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0]; dout[0] = (din[0] > threshold ? din[0] : 0.f);
ptr_out_grad++;
ptr_in_grad++;
din++; din++;
dout++;
} }
} }
#endif
} // namespace math } // namespace math
} // namespace arm } // namespace arm
......
...@@ -86,11 +86,9 @@ void act_reciprocal(const T* din, T* dout, int size, int threads); ...@@ -86,11 +86,9 @@ void act_reciprocal(const T* din, T* dout, int size, int threads);
template <typename T> template <typename T>
void act_abs(const T* din, T* dout, int size, int threads); void act_abs(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T> template <typename T>
void act_square_grad( void act_thresholded_relu(
const T* din, const T* dout_grad, T* din_grad, int size, int threads); const T* din, T* dout, int size, float threshold, int threads);
#endif
} // namespace math } // namespace math
} // namespace arm } // namespace arm
......
...@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids, ...@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids,
selected_ids->Resize(dims); selected_ids->Resize(dims);
selected_scores->Resize(dims); selected_scores->Resize(dims);
if (parent_idx) { if (parent_idx) {
parent_idx->Resize(dims); parent_idx->Resize({static_cast<int64_t>(num_instances)});
} }
auto *selected_ids_data = selected_ids->mutable_data<int64_t>(); auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>(); auto *selected_scores_data = selected_scores->mutable_data<float>();
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/clip.h"
#include <algorithm>
#include <limits>
#include <memory>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/saturate.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void clip_kernel_fp32(
const float* input, int64_t num, float min, float max, float* output) {
float tmp;
for (int64_t i = 0; i < num; i++) {
tmp = *input;
tmp = tmp > min ? tmp : min;
*output = tmp < max ? tmp : max;
input++;
output++;
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
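The kernel above simply clamps each element into [min, max]. As a sanity reference, a standalone sketch of the same semantics (plain C++, independent of the Lite headers; the function name is illustrative):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Same element-wise behaviour as clip_kernel_fp32: out[i] = min(max(in[i], lo), hi).
static void clip_reference(
    const float* input, int64_t num, float lo, float hi, float* output) {
  for (int64_t i = 0; i < num; ++i) {
    output[i] = std::min(std::max(input[i], lo), hi);
  }
}

int main() {
  std::vector<float> in = {-3.f, -0.5f, 0.f, 0.5f, 3.f};
  std::vector<float> out(in.size());
  clip_reference(in.data(), static_cast<int64_t>(in.size()), -1.f, 1.f, out.data());
  assert(out.front() == -1.f && out.back() == 1.f);  // values are clamped into [-1, 1]
  return 0;
}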
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void clip_kernel_fp32(
const float* input, int64_t num, float min, float max, float* output);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
...@@ -1245,7 +1245,7 @@ void weight_trans_c4_8x8( ...@@ -1245,7 +1245,7 @@ void weight_trans_c4_8x8(
for (int i = 0; i < ch_out * ch_in * 64; ++i) { for (int i = 0; i < ch_out * ch_in * 64; ++i) {
int new_c = i % 64; int new_c = i % 64;
int new_oc = i / ch_in / 64 / 4; int new_oc = i / ch_in / 64 / 4;
int new_ic = i / 64 % (ch_in * 4) % ch_in; int new_ic = i / 64 % ch_in;
int new_inner = i / ch_in / 64 % 4; int new_inner = i / ch_in / 64 % 4;
int dest_ind = int dest_ind =
new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner;
...@@ -1302,7 +1302,7 @@ void weight_trans_c4_4x4( ...@@ -1302,7 +1302,7 @@ void weight_trans_c4_4x4(
for (int i = 0; i < ch_out * ch_in * 16; ++i) { for (int i = 0; i < ch_out * ch_in * 16; ++i) {
int new_c = i % 16; int new_c = i % 16;
int new_oc = i / ch_in / 16 / 4; int new_oc = i / ch_in / 16 / 4;
int new_ic = i / 16 % (ch_in * 4) % ch_in; int new_ic = i / 16 % ch_in;
int new_inner = i / ch_in / 16 % 4; int new_inner = i / ch_in / 16 % 4;
int dest_ind = int dest_ind =
new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner;
......
(This diff has been collapsed.)
...@@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din, ...@@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din,
} }
return true; return true;
} }
// for example: m = 4, n = 4
// din = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]
// dout = [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]
/*
 m = 8, n = 8:  din:                      dout:
                 0  1  2  3  4  5  6  7    0  8 16 24 32 40 48 56
                 8  9 10 11 12 13 14 15    1  9 17 25 33 41 49 57
                16 17 18 19 20 21 22 23    2 10 18 26 34 42 50 58
                24 25 26 27 28 29 30 31    3 11 19 27 35 43 51 59
                ...                        ...
*/
template <typename Dtype>
void local_transpose(const Dtype* din, Dtype* dout, int m, int n) {
// n % 4 == 0 && m % 4 == 0
// n * m ==> n * m data trans
int offset_m = m << 2;
const Dtype* din_ptr = din;
Dtype* dout_ptr = dout;
for (int i = 0; i < n; i += 4) {
Dtype* out_ptr0 = dout_ptr;
Dtype* out_ptr1 = dout_ptr + m;
Dtype* out_ptr2 = out_ptr1 + m;
Dtype* out_ptr3 = out_ptr2 + m;
const Dtype* in_ptr0 = din_ptr;
const Dtype* in_ptr1 = din_ptr + m;
const Dtype* in_ptr2 = in_ptr1 + m;
const Dtype* in_ptr3 = in_ptr2 + m;
for (int j = 0; j < m; j += 4) {
float32x4_t vin0 = vld1q_f32(in_ptr0);
float32x4_t vin1 = vld1q_f32(in_ptr1);
float32x4_t vin2 = vld1q_f32(in_ptr2);
float32x4_t vin3 = vld1q_f32(in_ptr3);
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(vin0, vin1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(vin2, vin3);
in_ptr0 = in_ptr3 + m;
in_ptr1 = in_ptr3 + 2 * m;
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
in_ptr2 = in_ptr1 + m;
in_ptr3 = in_ptr1 + 2 * m;
vst1q_f32(out_ptr0, tmp0.val[0]);
vst1q_f32(out_ptr1, tmp0.val[1]);
out_ptr0 += 4;
out_ptr1 += 4;
vst1q_f32(out_ptr2, tmp2.val[0]);
vst1q_f32(out_ptr3, tmp2.val[1]);
out_ptr2 += 4;
out_ptr3 += 4;
}
dout_ptr += offset_m;
din_ptr += 4;
}
}
template <typename Dtype>
void transpose(const Dtype* din, Dtype* dout, int m, int n) {
// nxm == mxn
// 4x4
int cnt_n = n >> 2;
int remain_n = n & 3;
int cnt_m = m >> 2;
int remain_m = m & 3;
int nn_num = n << 2; // n * 4
int mm_num = m << 2; // m * 4
for (int x = 0; x < cnt_n; x++) {
const Dtype* din_ptr0 = din + x * mm_num;
const Dtype* din_ptr1 = din_ptr0 + m;
const Dtype* din_ptr2 = din_ptr1 + m;
const Dtype* din_ptr3 = din_ptr2 + m;
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0); // a00 a01 a02 a03
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(din0, din1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(din2, din3);
din_ptr0 += 4;
din_ptr1 += 4;
// a00 b00 c00 d00 a02 b02 c02 d02
// a01 b01 c01 d01 a03 b03 c03 d03
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
din_ptr2 += 4;
din_ptr3 += 4;
vst1q_f32(dout_ptr0, tmp0.val[0]);
vst1q_f32(dout_ptr1, tmp0.val[1]);
dout_ptr0 += nn_num;
vst1q_f32(dout_ptr2, tmp2.val[0]);
vst1q_f32(dout_ptr3, tmp2.val[1]);
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
*dout_ptr0++ = *din_ptr1++;
*dout_ptr0++ = *din_ptr2++;
*dout_ptr0++ = *din_ptr3++;
}
}
const Dtype* din_ptr0 = din + cnt_n * mm_num;
dout = dout + cnt_n * 4;
for (int x = 0; x < remain_n; x++) {
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
din_ptr0 += 4;
*dout_ptr0 = din0[0];
*dout_ptr1 = din0[1];
dout_ptr0 += nn_num;
*dout_ptr2 = din0[2];
*dout_ptr3 = din0[3];
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
}
}
}
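The NEON routines above (local_transpose and the blocked transpose) are expected to produce the same result as a plain row-major transpose. A standalone scalar sketch for comparison, using the 4x4 example from the comment (the rows/cols naming here is illustrative, not the m/n arguments of the functions above):

#include <cassert>
#include <vector>

// Scalar reference for a row-major transpose: `in` holds rows x cols elements,
// `out` receives the cols x rows transpose, i.e. out[c][r] = in[r][c].
template <typename T>
void transpose_naive(const T* in, T* out, int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      out[c * rows + r] = in[r * cols + c];
    }
  }
}

int main() {
  // The 4x4 example from the comment above:
  // din  = 0 1 2 3 / 4 5 6 7 / 8 9 10 11 / 12 13 14 15
  // dout = 0 4 8 12 / 1 5 9 13 / 2 6 10 14 / 3 7 11 15
  std::vector<int> din(16), dout(16);
  for (int i = 0; i < 16; ++i) din[i] = i;
  transpose_naive(din.data(), dout.data(), 4, 4);
  assert(dout[0] == 0 && dout[1] == 4 && dout[4] == 1 && dout[15] == 15);
  return 0;
}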
/*preprocessing inputs /*preprocessing inputs
* input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws] * input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws]
* n = he - hs * n = he - hs
...@@ -3762,6 +3907,7 @@ inline void write_int32_nchwc8_to_nchw(const int* din, ...@@ -3762,6 +3907,7 @@ inline void write_int32_nchwc8_to_nchw(const int* din,
int w_stride = we - ws; int w_stride = we - ws;
int valid_w = (we > width ? width : we) - ws; int valid_w = (we > width ? width : we) - ws;
int cnt = valid_w / 4; int cnt = valid_w / 4;
int remain = valid_w & 3;
float32x4_t w_scale0 = vld1q_f32(scale); float32x4_t w_scale0 = vld1q_f32(scale);
float32x4_t w_scale1 = vld1q_f32(scale + 4); float32x4_t w_scale1 = vld1q_f32(scale + 4);
...@@ -3818,10 +3964,10 @@ inline void write_int32_nchwc8_to_nchw(const int* din, ...@@ -3818,10 +3964,10 @@ inline void write_int32_nchwc8_to_nchw(const int* din,
flag_act, flag_act,
alpha); alpha);
} }
if (we > width) { if (remain > 0) {
int offset = 32 * cnt; int offset = 32 * cnt;
din_hei_ptr = ptr_din + offset; din_hei_ptr = ptr_din + offset;
for (int j = ws + cnt * 4; j < width; ++j) { for (int j = 0; j < remain; ++j) {
if (flag_bias) { if (flag_bias) {
*(doutc0_ptr++) = cvt_kernel<Dtype>( *(doutc0_ptr++) = cvt_kernel<Dtype>(
din_hei_ptr[0], scale[0], bias[0], flag_act, alpha[0]); din_hei_ptr[0], scale[0], bias[0], flag_act, alpha[0]);
......
...@@ -359,6 +359,35 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -359,6 +359,35 @@ void conv_compute_2x2_3x3_small(const float* input,
const float* bias, const float* bias,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx); ARMContext* ctx);
void input_trans_c8_4x4_int8(const int8_t* src,
int src_stride,
int src_h_stride,
int16_t* dest,
int dest_stride,
int dest_h_stride);
void output_trans_c8_post_2x4_int8(const int32_t* src,
int src_stride,
int src_h_stride,
int32_t* dest,
int dest_stride,
int dest_h_stride);
void weight_trans_c8_4x4_int8(
int16_t* dest, const int8_t* src, int ic, int oc, void* workspace);
template <typename Dtype>
void conv_compute_2x2_3x3_int8(const int8_t* input,
Dtype* output,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const int16_t* weight,
const float* bias,
const float* scale,
const operators::ConvParam& param,
ARMContext* ctx);
template <typename Dtype> template <typename Dtype>
void im2col(const Dtype* data_im, void im2col(const Dtype* data_im,
......
...@@ -11,8 +11,8 @@ ...@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/backends/arm/math/elementwise.h" #include "lite/backends/arm/math/elementwise.h"
#include <math.h>
#include <algorithm> #include <algorithm>
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
...@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx, ...@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx,
} }
} }
template <>
void elementwise_mul<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
dout[i] = dinx[i] * diny[i];
}
}
template <> template <>
void elementwise_mul_relu<float>(const float* dinx, void elementwise_mul_relu<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx, ...@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx,
} }
} }
template <>
void elementwise_mul_relu<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
int64_t tmp = dinx[i] * diny[i];
dout[i] = tmp > 0 ? tmp : 0;
}
}
template <> template <>
void elementwise_mul_broadcast<float>(const float* dinx, void elementwise_mul_broadcast<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx, ...@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx,
} }
} }
template <>
void elementwise_mul_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dinx_ptr * diny_data;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <> template <>
void elementwise_mul_relu_broadcast<float>(const float* dinx, void elementwise_mul_relu_broadcast<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx, ...@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx,
} }
} }
template <>
void elementwise_mul_relu_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
int64_t tmp = *dinx_ptr * diny_data;
*dout_ptr = tmp > 0 ? tmp : 0;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <> template <>
void elementwise_max<float>(const float* dinx, void elementwise_max<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -1254,6 +1322,19 @@ void elementwise_max_relu_broadcast<float>(const float* dinx, ...@@ -1254,6 +1322,19 @@ void elementwise_max_relu_broadcast<float>(const float* dinx,
} }
} }
template <>
void elementwise_div<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
*dout = *dinx / *diny;
dout++;
dinx++;
diny++;
}
}
template <> template <>
void elementwise_div<float>(const float* dinx, void elementwise_div<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -1306,6 +1387,28 @@ void elementwise_div<float>(const float* dinx, ...@@ -1306,6 +1387,28 @@ void elementwise_div<float>(const float* dinx,
} }
} }
template <>
void elementwise_div_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* din_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int p = 0; p < num; p++) {
*dout_ptr = *din_ptr / diny_data;
dout_ptr++;
din_ptr++;
}
}
}
}
template <> template <>
void elementwise_div_broadcast<float>(const float* dinx, void elementwise_div_broadcast<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -1541,6 +1644,87 @@ void elementwise_div_relu_broadcast<float>(const float* dinx, ...@@ -1541,6 +1644,87 @@ void elementwise_div_relu_broadcast<float>(const float* dinx,
} }
} }
template <typename T>
void elementwise_mod_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const T* din_ptr = dinx + offset;
const T diny_data = diny[j];
T* dout_ptr = dout + offset;
int cnt = num >> 2;
int remain = num % 4;
for (int k = 0; k < cnt; ++k) {
register T dinx0 = din_ptr[0];
register T dinx1 = din_ptr[1];
register T dinx2 = din_ptr[2];
register T dinx3 = din_ptr[3];
dout_ptr[0] = dinx0 % diny_data;
dout_ptr[1] = dinx1 % diny_data;
dout_ptr[2] = dinx2 % diny_data;
dout_ptr[3] = dinx3 % diny_data;
din_ptr += 4;
dout_ptr += 4;
}
if (remain > 0) {
for (int p = 0; p < remain; p++) {
*dout_ptr++ = *din_ptr++ % diny_data;
}
}
}
}
}
template <typename T>
void elementwise_mod(const T* dinx, const T* diny, T* dout, int num) {
int cnt = num >> 2;
int remain = num % 4;
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
const T* dinx_ptr = dinx + (i << 2);
const T* diny_ptr = diny + (i << 2);
T* dout_ptr = dout + (i << 2);
register T dinx0 = dinx_ptr[0];
register T dinx1 = dinx_ptr[1];
register T dinx2 = dinx_ptr[2];
register T dinx3 = dinx_ptr[3];
register T diny0 = diny_ptr[0];
register T diny1 = diny_ptr[1];
register T diny2 = diny_ptr[2];
register T diny3 = diny_ptr[3];
dout_ptr[0] = dinx0 % diny0;
dout_ptr[1] = dinx1 % diny1;
dout_ptr[2] = dinx2 % diny2;
dout_ptr[3] = dinx3 % diny3;
}
if (remain > 0) {
const T* dinx_ptr = dinx + (cnt << 2);
const T* diny_ptr = diny + (cnt << 2);
T* dout_ptr = dout + (cnt << 2);
for (int i = 0; i < remain; i++) {
*dout_ptr++ = *dinx_ptr++ % *diny_ptr++;
}
}
}
template void elementwise_mod<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num);
template void elementwise_mod_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
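The unrolled elementwise_mod paths above must agree with a plain element-wise integer modulo. A standalone sketch of that contract (names are illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

// Plain reference: out[i] = x[i] % y[i], which the 4-wide unrolled version above must match.
static void elementwise_mod_reference(
    const int64_t* x, const int64_t* y, int64_t* out, int num) {
  for (int i = 0; i < num; ++i) {
    out[i] = x[i] % y[i];
  }
}

int main() {
  // Length 7: exercises both the 4-wide body and the remainder tail of the unrolled code.
  std::vector<int64_t> x = {10, 11, 12, 13, 14, 15, 16};
  std::vector<int64_t> y = {3, 3, 5, 5, 7, 7, 9};
  std::vector<int64_t> out(x.size());
  elementwise_mod_reference(x.data(), y.data(), out.data(), static_cast<int>(x.size()));
  assert(out[0] == 1 && out[2] == 2 && out[6] == 7);
  return 0;
}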
...@@ -253,6 +253,13 @@ template <typename T> ...@@ -253,6 +253,13 @@ template <typename T>
void elementwise_div_relu_broadcast( void elementwise_div_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num); const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_mod(const T* dinx, const T* diny, T* dout, int num);
template <typename T>
void elementwise_mod_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "lite/backends/arm/math/axpy.h" #include "lite/backends/arm/math/axpy.h"
#include "lite/backends/arm/math/beam_search.h" #include "lite/backends/arm/math/beam_search.h"
#include "lite/backends/arm/math/box_coder.h" #include "lite/backends/arm/math/box_coder.h"
#include "lite/backends/arm/math/clip.h"
#include "lite/backends/arm/math/col_im_transform.h" #include "lite/backends/arm/math/col_im_transform.h"
#include "lite/backends/arm/math/concat.h" #include "lite/backends/arm/math/concat.h"
#include "lite/backends/arm/math/conv_block_utils.h" #include "lite/backends/arm/math/conv_block_utils.h"
......
...@@ -2242,19 +2242,45 @@ void gemm_prepack_oth_int8(const int8_t* A_packed, ...@@ -2242,19 +2242,45 @@ void gemm_prepack_oth_int8(const int8_t* A_packed,
Dtype* tmp1 = nullptr; Dtype* tmp1 = nullptr;
Dtype* tmp2 = nullptr; Dtype* tmp2 = nullptr;
Dtype* tmp3 = nullptr; Dtype* tmp3 = nullptr;
float32_t scale_local[4]; float32_t scale_local[4] = {0, 0, 0, 0};
float32_t bias_local[4] = {0, 0, 0, 0}; float32_t bias_local[4] = {0, 0, 0, 0};
if (is_bias) { if (is_bias) {
bias_local[0] = bias[y]; if (y + 4 <= M) {
bias_local[1] = bias[y + 1]; bias_local[0] = bias[y];
bias_local[2] = bias[y + 2]; bias_local[1] = bias[y + 1];
bias_local[3] = bias[y + 3]; bias_local[2] = bias[y + 2];
bias_local[3] = bias[y + 3];
} else {
switch (M - y) {
case 3:
bias_local[2] = bias[y + 2];
case 2:
bias_local[1] = bias[y + 1];
case 1:
bias_local[0] = bias[y + 0];
default:
break;
}
}
} }
if (scale) { if (scale) {
scale_local[0] = scale[y]; if (y + 4 <= M) {
scale_local[1] = scale[y + 1]; scale_local[0] = scale[y];
scale_local[2] = scale[y + 2]; scale_local[1] = scale[y + 1];
scale_local[3] = scale[y + 3]; scale_local[2] = scale[y + 2];
scale_local[3] = scale[y + 3];
} else {
switch (M - y) {
case 3:
scale_local[2] = scale[y + 2];
case 2:
scale_local[1] = scale[y + 1];
case 1:
scale_local[0] = scale[y + 0];
default:
break;
}
}
} }
if (y + MBLOCK_INT8_OTH > M) { if (y + MBLOCK_INT8_OTH > M) {
switch (y + MBLOCK_INT8_OTH - M) { switch (y + MBLOCK_INT8_OTH - M) {
......
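The change above guards the bias/scale loads so that when fewer than four output rows remain (y + 4 > M) only the valid entries are copied and the rest stay zero. A standalone sketch of that tail-load pattern (the helper name is hypothetical, not part of the Lite API):

#include <algorithm>
#include <cassert>
#include <vector>

// Copy up to four values starting at src[y] into dst[0..3]; entries past M stay at 0.
// Mirrors the guarded switch-with-fallthrough used for bias_local / scale_local above.
static void load_tail4(const float* src, int y, int M, float dst[4]) {
  std::fill(dst, dst + 4, 0.f);
  int valid = std::min(4, M - y);
  for (int i = 0; i < valid; ++i) {
    dst[i] = src[y + i];
  }
}

int main() {
  std::vector<float> bias = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // M = 6
  float local[4];
  load_tail4(bias.data(), 4, 6, local);  // only rows 4 and 5 remain
  assert(local[0] == 5.f && local[1] == 6.f && local[2] == 0.f && local[3] == 0.f);
  return 0;
}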
...@@ -54,6 +54,13 @@ void sgemm_prepack_c4_small(int M, ...@@ -54,6 +54,13 @@ void sgemm_prepack_c4_small(int M,
const float* B, const float* B,
float* C, float* C,
ARMContext* ctx); ARMContext* ctx);
void sgemm_prepack_c8_int16_small(int M,
int N,
int K,
const int16_t* A_packed,
const int16_t* B,
int32_t* C,
ARMContext* ctx);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din, ...@@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din,
} else { } else {
if (pad_bottom > 1) { if (pad_bottom > 1) {
coef_h = 1.f / 3; coef_h = 1.f / 3;
} else if (pad_bottom = 1) { } else if (pad_bottom == 1) {
coef_h = 0.5f; coef_h = 0.5f;
} else { } else {
coef_h = 1.f; coef_h = 1.f;
......
...@@ -21,7 +21,7 @@ namespace lite { ...@@ -21,7 +21,7 @@ namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
const int MALLOC_ALIGN = 64; const int MALLOC_ALIGN = 16;
void* fast_malloc(size_t size) { void* fast_malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......
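fast_malloc over-allocates by sizeof(void*) + MALLOC_ALIGN - 1 bytes, rounds the user pointer up to the alignment boundary, and stores the original pointer just before it for the matching free. A standalone sketch of that scheme, assuming the new 16-byte alignment (illustrative, not the Lite implementation):

#include <cassert>
#include <cstdint>
#include <cstdlib>

constexpr size_t kAlign = 16;  // the new MALLOC_ALIGN value

void* aligned_malloc(size_t size) {
  // Extra room for the saved pointer plus worst-case alignment padding.
  size_t offset = sizeof(void*) + kAlign - 1;
  char* raw = static_cast<char*>(std::malloc(size + offset));
  if (raw == nullptr) return nullptr;
  // Round (raw + sizeof(void*)) up to a multiple of kAlign.
  void** aligned = reinterpret_cast<void**>(
      (reinterpret_cast<uintptr_t>(raw) + offset) &
      ~static_cast<uintptr_t>(kAlign - 1));
  aligned[-1] = raw;  // remember the original block for aligned_free
  return aligned;
}

void aligned_free(void* p) {
  if (p != nullptr) std::free(static_cast<void**>(p)[-1]);
}

int main() {
  void* p = aligned_malloc(100);
  assert(reinterpret_cast<uintptr_t>(p) % kAlign == 0);
  aligned_free(p);
  return 0;
}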
...@@ -46,11 +46,60 @@ void seq_pool_sum<float>(const float* din, ...@@ -46,11 +46,60 @@ void seq_pool_sum<float>(const float* din,
memcpy(dout_ptr, din_ptr, width * sizeof(float)); memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width; din_ptr += width;
height = height - 1; height = height - 1;
for (int h = 0; h < height; h++) { int cnt_w = width >> 2;
for (int w = 0; w < width; ++w) { int remain_w = width & 3;
dout_ptr[w] += din_ptr[w]; int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vaddq_f32(din0, dout_val);
float32x4_t tmp = vaddq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vaddq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vaddq_f32(tmp, dout_val);
} }
din_ptr += width; for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vaddq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr += din_ptr0[0];
float tmp = din_ptr1[0] + din_ptr2[0];
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr += din_ptr3[0];
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr += tmp;
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr += din_ptr0[0];
din_ptr0 += width;
}
dout_ptr++;
} }
} }
} }
...@@ -144,12 +193,62 @@ void seq_pool_max<float>(const float* din, ...@@ -144,12 +193,62 @@ void seq_pool_max<float>(const float* din,
} else { } else {
memcpy(dout_ptr, din_ptr, width * sizeof(float)); memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width; din_ptr += width;
int remain_h = height - 1; height = height - 1;
for (int h = 0; h < remain_h; h++) { int cnt_w = width >> 2;
for (int w = 0; w < width; w++) { int remain_w = width & 3;
dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]); int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vmaxq_f32(din0, dout_val);
float32x4_t tmp = vmaxq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vmaxq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vmaxq_f32(tmp, dout_val);
} }
din_ptr += width; for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vmaxq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
float tmp = std::max(din_ptr1[0], din_ptr2[0]);
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr = std::max(*dout_ptr, din_ptr3[0]);
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr = std::max(*dout_ptr, tmp);
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
din_ptr0 += width;
}
dout_ptr++;
} }
} }
} }
......
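The NEON versions of seq_pool_sum and seq_pool_max above tile the height x width block by 4 in both directions; they must reduce each column the same way as this scalar reference (standalone sketch):

#include <algorithm>
#include <cassert>
#include <vector>

// Column-wise sum and max over a height x width row-major block.
static void seq_pool_sum_ref(const float* din, float* dout, int height, int width) {
  for (int w = 0; w < width; ++w) dout[w] = din[w];
  for (int h = 1; h < height; ++h)
    for (int w = 0; w < width; ++w) dout[w] += din[h * width + w];
}

static void seq_pool_max_ref(const float* din, float* dout, int height, int width) {
  for (int w = 0; w < width; ++w) dout[w] = din[w];
  for (int h = 1; h < height; ++h)
    for (int w = 0; w < width; ++w) dout[w] = std::max(dout[w], din[h * width + w]);
}

int main() {
  // 3 x 2 block: rows {1, 4}, {2, 5}, {3, 6}
  std::vector<float> din = {1.f, 4.f, 2.f, 5.f, 3.f, 6.f};
  std::vector<float> sum(2), mx(2);
  seq_pool_sum_ref(din.data(), sum.data(), 3, 2);
  seq_pool_max_ref(din.data(), mx.data(), 3, 2);
  assert(sum[0] == 6.f && sum[1] == 15.f);
  assert(mx[0] == 3.f && mx[1] == 6.f);
  return 0;
}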
...@@ -531,7 +531,7 @@ void softmax_inner1_large_axis<float>(const float* din, ...@@ -531,7 +531,7 @@ void softmax_inner1_large_axis<float>(const float* din,
} }
float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax));
float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1));
for (j = 4 * j; j < axis_size; ++j) { for (j = 4 * nn; j < axis_size; ++j) {
max_data = std::max(max_data, din_max_ptr[0]); max_data = std::max(max_data, din_max_ptr[0]);
din_max_ptr++; din_max_ptr++;
} }
...@@ -557,7 +557,7 @@ void softmax_inner1_large_axis<float>(const float* din, ...@@ -557,7 +557,7 @@ void softmax_inner1_large_axis<float>(const float* din,
float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum));
float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1);
for (j = 4 * j; j < axis_size; ++j) { for (j = 4 * nn; j < axis_size; ++j) {
dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data);
sum_data += dout_sum_ptr[0]; sum_data += dout_sum_ptr[0];
din_sum_ptr++; din_sum_ptr++;
......
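The fix above makes the scalar tail start at 4 * nn, i.e. immediately after the elements consumed by the 4-wide vector body, instead of reusing a stale loop variable. The general pattern, with plain C++ standing in for the NEON body (standalone sketch):

#include <cassert>
#include <vector>

// Reduce `axis_size` floats to their maximum: a 4-wide "vector" body followed by a
// scalar tail that must begin at 4 * nn.
static float max_reduce(const float* data, int axis_size) {
  int nn = axis_size >> 2;  // number of full groups of four
  float max_val = data[0];
  for (int j = 0; j < nn; ++j) {  // vectorized part (scalar stand-in here)
    for (int k = 0; k < 4; ++k)
      max_val = data[4 * j + k] > max_val ? data[4 * j + k] : max_val;
  }
  for (int j = 4 * nn; j < axis_size; ++j) {  // remainder: starts at 4 * nn
    max_val = data[j] > max_val ? data[j] : max_val;
  }
  return max_val;
}

int main() {
  std::vector<float> v = {1.f, 5.f, 2.f, 4.f, 3.f, 9.f, 0.f};  // 7 elements: tail of length 3
  assert(max_reduce(v.data(), static_cast<int>(v.size())) == 9.f);
  return 0;
}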
...@@ -41,6 +41,8 @@ ...@@ -41,6 +41,8 @@
<< "CUDA: " << cudaGetErrorString(e); \ << "CUDA: " << cudaGetErrorString(e); \
} }
#define CUDA_POST_KERNEL_CHECK CUDA_CALL(cudaPeekAtLastError())
#define CUBLAS_CALL(func) \ #define CUBLAS_CALL(func) \
{ \ { \
auto e = (func); \ auto e = (func); \
...@@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) { ...@@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) {
return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
#endif
#if CUDNN_VERSION_MIN(8, 0, 0)
case CUDNN_STATUS_VERSION_MISMATCH:
return "CUDNN_STATUS_VERSION_MISMATCH";
#endif #endif
} }
return "Unknown cudnn status"; return "Unknown cudnn status";
......
...@@ -11,8 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) ...@@ -11,8 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps})
nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps})
nv_library(cuda_sequence2batch SRCS sequence2batch.cu DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps})
set ( set (
math_cuda math_cuda
...@@ -23,8 +28,13 @@ set ( ...@@ -23,8 +28,13 @@ set (
cuda_transpose cuda_transpose
cuda_elementwise cuda_elementwise
cudnn_pool cudnn_pool
cuda_gru_forward
cuda_sequence2batch
cuda_gemm cuda_gemm
cuda_batched_gemm cuda_batched_gemm
cuda_strided_gemm
cuda_sequence_padding
cuda_bias
) )
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <iostream> #include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/activation.h" #include "lite/backends/cuda/math/activation.h"
#include "lite/backends/cuda/math/utils.h" #include "lite/backends/cuda/math/utils.h"
...@@ -21,6 +22,20 @@ namespace lite { ...@@ -21,6 +22,20 @@ namespace lite {
namespace cuda { namespace cuda {
namespace math { namespace math {
ActivationType GetActiveType(const std::string& act) {
if (act == "sigmoid") {
return kSigmoid;
} else if (act == "relu") {
return kReLU;
} else if (act == "tanh") {
return kTanh;
} else if (act == "identify") {
return kIdentity;
} else {
LOG(FATAL) << "not supported activation: " << act;
}
}
template <typename T> template <typename T>
__global__ void relu_kernel(const int num, __global__ void relu_kernel(const int num,
const float alpha, const float alpha,
...@@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t); ...@@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t);
template void bias_relu( template void bias_relu(
int, const float*, const float* bias, float*, float, cudaStream_t); int, const float*, const float* bias, float*, float, cudaStream_t);
// ------------- sigmoid -------------
template <typename T>
__global__ void sigmoid_kernel(const int num, const T* in, T* out) {
CUDA_KERNEL_LOOP(i, num) {
#if __CUDA_ARCH__ >= 350
out[i] = static_cast<T>(1.0f) /
(static_cast<T>(1.0f) + expf(-1 * __ldg(in + i)));
#else
out[i] = static_cast<T>(1.0f) / (static_cast<T>(1.0f) + expf(-in[i]));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half* in, half* out) {
CUDA_KERNEL_LOOP(i, num) {
half tmp = __float2half(1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __hdiv(
tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.0f), __ldg(in + i)))));
#else
out[i] = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i]))));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half2* in, half2* out) {
CUDA_KERNEL_LOOP(i, num) {
half2 tmp = __floats2half2_rn(1.0f, 1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __h2div(tmp,
__hadd2(tmp,
h2exp(__hmul2(__floats2half2_rn(-1.0f, -1.0f),
__ldg(in + i)))));
#else
out[i].x = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].x))));
out[i].y = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].y))));
#endif
}
}
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream) {
sigmoid_kernel<T><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
CUDA_POST_KERNEL_CHECK;
}
template <>
void sigmoid(const int num, const half* din, half* dout, cudaStream_t stream) {
if (num % 2 == 0) {
const half2* din2 = reinterpret_cast<const half2*>(din);
half2* dout2 = reinterpret_cast<half2*>(dout);
sigmoid_kernel<
half2><<<CUDA_GET_BLOCKS(num / 2), CUDA_NUM_THREADS, 0, stream>>>(
num / 2, din2, dout2);
} else {
sigmoid_kernel<half><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
}
CUDA_POST_KERNEL_CHECK;
}
template void sigmoid(const int num,
const float* din,
float* dout,
cudaStream_t stream);
} // namespace math } // namespace math
} // namespace cuda } // namespace cuda
} // namespace lite } // namespace lite
......
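The CUDA kernels above (float, half, and the packed half2 path that processes two values per thread) all compute the logistic function. A host-side reference of the expected values (standalone sketch):

#include <cassert>
#include <cmath>
#include <vector>

// Host reference: out[i] = 1 / (1 + exp(-in[i])), which the float/half/half2 kernels approximate.
static void sigmoid_reference(const float* in, float* out, int num) {
  for (int i = 0; i < num; ++i) {
    out[i] = 1.0f / (1.0f + std::exp(-in[i]));
  }
}

int main() {
  std::vector<float> in = {-2.f, 0.f, 2.f};
  std::vector<float> out(in.size());
  sigmoid_reference(in.data(), out.data(), static_cast<int>(in.size()));
  assert(std::fabs(out[1] - 0.5f) < 1e-6f);            // sigmoid(0) = 0.5
  assert(std::fabs(out[0] + out[2] - 1.0f) < 1e-6f);   // sigmoid(-x) + sigmoid(x) = 1
  return 0;
}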
...@@ -17,11 +17,22 @@ ...@@ -17,11 +17,22 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <string> #include <string>
#include "lite/utils/cp_logging.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace cuda { namespace cuda {
namespace math { namespace math {
enum ActivationType {
kSigmoid,
kReLU,
kTanh,
kIdentity,
};
ActivationType GetActiveType(const std::string& act);
// fp32 and half // fp32 and half
template <typename T> template <typename T>
void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream); void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream);
...@@ -72,6 +83,9 @@ void bias_int8_nhwc(int num, ...@@ -72,6 +83,9 @@ void bias_int8_nhwc(int num,
const void* scale, const void* scale,
cudaStream_t stream); cudaStream_t stream);
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream);
} // namespace math } // namespace math
} // namespace cuda } // namespace cuda
} // namespace lite } // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/bias.h"
#include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
__global__ void RowwiseAddKernel(
const T* a, const T* b, T* c, int width, int num) {
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = a[i] + b[w];
}
}
template <>
__global__ void RowwiseAddKernel(
const half* a, const half* b, half* c, int width, int num) {
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = __hadd(a[i], b[w]);
}
}
template <typename T>
void RowwiseAdd<T>::operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream) {
RowwiseAddKernel<T><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
input, bias, output, width, count);
CUDA_POST_KERNEL_CHECK;
}
template struct RowwiseAdd<float>;
template struct RowwiseAdd<half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
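RowwiseAddKernel broadcasts a length-width bias across every row of a row-major matrix: c[i] = a[i] + b[i % width]. A host-side sketch of the same contract (illustrative only):

#include <cassert>
#include <vector>

// Host reference for RowwiseAdd: `count` elements in row-major order, bias of
// length `width` added to every row, i.e. c[i] = a[i] + b[i % width].
static void rowwise_add_reference(
    const float* a, const float* b, float* c, int width, int count) {
  for (int i = 0; i < count; ++i) {
    c[i] = a[i] + b[i % width];
  }
}

int main() {
  const int width = 3, count = 6;  // 2 rows x 3 columns
  std::vector<float> a = {0.f, 0.f, 0.f, 1.f, 1.f, 1.f};
  std::vector<float> b = {10.f, 20.f, 30.f};
  std::vector<float> c(count);
  rowwise_add_reference(a.data(), b.data(), c.data(), width, count);
  assert(c[0] == 10.f && c[2] == 30.f && c[3] == 11.f && c[5] == 31.f);
  return 0;
}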
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
struct RowwiseAdd {
void operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream);
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
...@@ -161,15 +161,17 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param, ...@@ -161,15 +161,17 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
search_func); search_func);
} else { } else {
CUDNN_CHECK( int requestedAlgoCount = 1;
cudnnGetConvolutionForwardAlgorithm(this->handle_, int returnedAlgoCount;
this->input_desc_, CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(this->handle_,
this->filter_desc_, this->input_desc_,
this->conv_desc_, this->filter_desc_,
this->output_desc_, this->conv_desc_,
this->preference_, this->output_desc_,
this->workspace_limit_bytes_, requestedAlgoCount,
&this->fwd_algo_)); &returnedAlgoCount,
&this->algo_perf_));
this->fwd_algo_ = this->algo_perf_.algo;
} }
CUDNN_CHECK( CUDNN_CHECK(
cudnnGetConvolutionForwardWorkspaceSize(this->handle_, cudnnGetConvolutionForwardWorkspaceSize(this->handle_,
......
...@@ -81,6 +81,7 @@ class CudnnConv2DBase { ...@@ -81,6 +81,7 @@ class CudnnConv2DBase {
cudaStream_t stream_; cudaStream_t stream_;
cudnnHandle_t handle_; cudnnHandle_t handle_;
cudnnConvolutionFwdAlgo_t fwd_algo_; cudnnConvolutionFwdAlgo_t fwd_algo_;
cudnnConvolutionFwdAlgoPerf_t algo_perf_;
cudnnTensorDescriptor_t input_desc_; cudnnTensorDescriptor_t input_desc_;
cudnnTensorDescriptor_t output_desc_; cudnnTensorDescriptor_t output_desc_;
cudnnTensorDescriptor_t bias_desc_; cudnnTensorDescriptor_t bias_desc_;
...@@ -98,8 +99,6 @@ class CudnnConv2DBase { ...@@ -98,8 +99,6 @@ class CudnnConv2DBase {
const bool use_tensor_core_ = true; const bool use_tensor_core_ = true;
const size_t workspace_limit_bytes_ = 4 * 1024 * 1024; const size_t workspace_limit_bytes_ = 4 * 1024 * 1024;
const cudnnConvolutionFwdPreference_t preference_ =
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
// For int8 // For int8
Tensor temp_tensor_; Tensor temp_tensor_;
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include "lite/backends/cuda/math/gru_forward.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <typename T>
__global__ void GruForwardResetOutput(
T* gate_value,
T* reset_output_value,
T* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
T prev_out = 0;
T reset_out_val;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == lite::cuda::math::ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
reset_out_val = prev_out * reset_gate_value;
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
template <>
__global__ void GruForwardResetOutput(
half* gate_value,
half* reset_output_value,
half* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
half prev_out = 0;
half reset_out_val;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
#if __CUDA_ARCH__ >= 530
reset_out_val = __hmul(prev_out, reset_gate_value);
#else
reset_out_val =
__float2half(__half2float(prev_out) * __half2float(reset_gate_value));
#endif
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <typename T>
__global__ void GruForwardFinalOutput(
T* gate_value,
T* prev_output_value,
T* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
T output;
T prev_out = 0;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
if (origin_mode) {
output = update_gate_value * prev_out + state_frame_value -
update_gate_value * state_frame_value;
} else {
output = prev_out - update_gate_value * prev_out +
update_gate_value * state_frame_value;
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template <>
__global__ void GruForwardFinalOutput(
half* gate_value,
half* prev_output_value,
half* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
half output;
half prev_out = 0;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
if (origin_mode) {
#if __CUDA_ARCH__ >= 530
output =
__hsub(__hadd(__hmul(update_gate_value, prev_out), state_frame_value),
__hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(state_frame_value) -
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
} else {
#if __CUDA_ARCH__ >= 530
output = __hadd(__hsub(prev_out, __hmul(update_gate_value, prev_out)),
__hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(prev_out) -
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template __global__ void GruForwardFinalOutput<float>(
float* gate_value,
float* prev_output_value,
float* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch);
template __global__ void GruForwardResetOutput<float>(
float* gate_value,
float* reset_output_value,
float* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
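The kernels above expect a 2-D launch, threads(frame_per_block, batch_per_block) with grid(frame_blocks, batch_blocks), so threadIdx.x/blockIdx.x walk the frame dimension and threadIdx.y/blockIdx.y the batch dimension. A plain C++ sketch of that geometry computation (the block shape chosen here is an assumption, not taken from the Lite sources):

#include <cassert>

// Ceiling division: number of blocks of `per_block` threads needed to cover `total` items.
static int blocks_for(int total, int per_block) {
  return (total + per_block - 1) / per_block;
}

int main() {
  const int frame_size = 100, batch_size = 5;            // illustrative problem size
  const int frame_per_block = 32, batch_per_block = 4;   // assumed block shape
  int frame_blocks = blocks_for(frame_size, frame_per_block);  // -> 4
  int batch_blocks = blocks_for(batch_size, batch_per_block);  // -> 2
  // Every (frame, batch) pair is covered; the kernels bounds-check the overhang.
  assert(frame_blocks * frame_per_block >= frame_size);
  assert(batch_blocks * batch_per_block >= batch_size);
  assert(frame_blocks == 4 && batch_blocks == 2);
  return 0;
}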
(This diff has been collapsed.)
...@@ -22,10 +22,6 @@ namespace lite { ...@@ -22,10 +22,6 @@ namespace lite {
namespace cuda { namespace cuda {
namespace math { namespace math {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T> template <typename T>
__global__ void scale_kernel(int count, __global__ void scale_kernel(int count,
const T* in_data, const T* in_data,
...@@ -48,7 +44,6 @@ __global__ void scale_kernel(int count, ...@@ -48,7 +44,6 @@ __global__ void scale_kernel(int count,
template <typename T> template <typename T>
__global__ void scale_kernel( __global__ void scale_kernel(
int count, const T* in_data, T* out_data, const T scale, const T bias) { int count, const T* in_data, T* out_data, const T scale, const T bias) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; } CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; }
} }
...@@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num, ...@@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num,
} }
template <typename T> template <typename T>
void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) { void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) {
int thread = 256; int thread = 256;
int block = (num + thread - 1) / thread; int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread, 0, stream>>>(num, in, out, scale, bias); scale_kernel<<<block, thread, 0, stream>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError(); CUDA_POST_KERNEL_CHECK;
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
} }
template <typename T> template <typename T>
...@@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) { ...@@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) {
int thread = 256; int thread = 256;
int block = (num + thread - 1) / thread; int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread>>>(num, in, out, scale, bias); scale_kernel<<<block, thread>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError(); CUDA_POST_KERNEL_CHECK;
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
} }
template void scale(int num, const float*, float*, float, cudaStream_t, float); template void scale(int num, const float*, float*, float, float, cudaStream_t);
template void scale(int num, const float*, float*, float, float); template void scale(int num, const float*, float*, float, float);
} // namespace math } // namespace math
......
...@@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num, ...@@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num,
cudaStream_t stream); cudaStream_t stream);
template <typename T> template <typename T>
void scale( void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream);
int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0);
template <typename T> template <typename T>
void scale(int num, const T* in, T* out, T scale, T bias = 0); void scale(int num, const T* in, T* out, T scale, T bias = 0);
......
(The diffs for the next six files have been collapsed.)
...@@ -174,24 +174,9 @@ void Transpose<T>::transpose(T* dst, ...@@ -174,24 +174,9 @@ void Transpose<T>::transpose(T* dst,
TransposeCUDAImpl<T>(src_dims, axes, src, dst, &Y_dims_, &strides_, stream); TransposeCUDAImpl<T>(src_dims, axes, src, dst, &Y_dims_, &strides_, stream);
} }
// template <typename T>
// void Transpose<T>::transpose(T* dst,
// const T* src,
// const std::vector<int>& src_dims,
// const std::vector<int>& axes,
// cudaStream_t* stream) {
// std::vector<int64_t> _src_dims(src_dims.size(), 0);
// std::transform(
// src_dims.begin(),
// src_dims.end(),
// _src_dims.begin(),
// [](int data) -> int64_t { return static_cast<int64_t>(data); });
// TransposeCUDAImpl<T>(_src_dims, axes, src, dst, &Y_dims_, &strides_,
// stream);
//}
template class Transpose<int8_t>; template class Transpose<int8_t>;
template class Transpose<float>; template class Transpose<float>;
template class Transpose<half>;
} // namespace math } // namespace math
} // namespace cuda } // namespace cuda
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/target_wrapper.h" #include "lite/core/target_wrapper.h"
namespace paddle { namespace paddle {
...@@ -31,6 +32,16 @@ class TargetWrapper<TARGET(kCUDA)> { ...@@ -31,6 +32,16 @@ class TargetWrapper<TARGET(kCUDA)> {
static size_t num_devices(); static size_t num_devices();
static size_t maximum_stream() { return 0; } static size_t maximum_stream() { return 0; }
static int GetComputeCapability() {
int dev_id = GetCurDevice();
int major, minor;
CUDA_CALL(cudaDeviceGetAttribute(
&major, cudaDevAttrComputeCapabilityMajor, dev_id));
CUDA_CALL(cudaDeviceGetAttribute(
&minor, cudaDevAttrComputeCapabilityMinor, dev_id));
return major * 10 + minor;
}
static size_t GetCurDevice() { static size_t GetCurDevice() {
int dev_id; int dev_id;
cudaGetDevice(&dev_id); cudaGetDevice(&dev_id);
......
(The diffs for the remaining 354 files have been collapsed.)