Commit 37f606d2 authored by J jiweibo

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into stream_manage

......@@ -116,4 +116,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
#flatbuffers
lite/model_parser/flatbuffers/framework_generated.h
build*
# hiai libs
ai_ddk_lib*
......@@ -10,3 +10,6 @@
[submodule "third-party/protobuf-host"]
path = third-party/protobuf-host
url = https://github.com/protocolbuffers/protobuf.git
[submodule "third-party/flatbuffers"]
path = third-party/flatbuffers
url = https://github.com/google/flatbuffers.git
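Since flatbuffers is now tracked as a git submodule, an existing checkout needs to fetch it before building. A minimal sketch using standard git commands (the path is taken from the entry above):

```shell
# Sync the new submodule definition and fetch third-party/flatbuffers.
git submodule sync
git submodule update --init third-party/flatbuffers
```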
......@@ -106,7 +106,8 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON)
lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." OFF)
lite_option(CUDA_WITH_FP16 "Compile with cuda half support" OFF)
lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
......@@ -168,6 +169,7 @@ if(LITE_WITH_RKNPU)
include(device/rknpu)
endif()
include(external/flatbuffers)
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
......@@ -35,8 +35,12 @@ endif()
if(NOT DEFINED ANDROID_API_LEVEL)
set(ANDROID_API_LEVEL "23")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
if(LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH)
set(ANDROID_API_LEVEL "24") # HIAI DDK depends on android-24
else()
set(ANDROID_API_LEVEL "22")
endif()
endif()
endif()
# then check input arm abi
......
......@@ -2,6 +2,10 @@ if(NOT LITE_WITH_CUDA)
return()
endif()
if(WITH_CUDA_FP16)
add_definitions("-DCUDA_WITH_FP16")
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 53 60 61 62")
......@@ -167,6 +171,10 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
endif()
if (CUDA_WITH_FP16)
STRING(REGEX REPLACE "30|35|50|52" "" paddle_known_gpu_archs ${paddle_known_gpu_archs})
endif()
include_directories(${CUDA_INCLUDE_DIRS})
if(NOT WITH_DSO)
if(WIN32)
......
......@@ -39,7 +39,7 @@ else()
endif()
find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib ${XPU_SDK_ROOT}/XTDK/shlib # libxpurt.so may have been moved to XTDK/runtime/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XPU_RT_FILE)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
INCLUDE(GNUInstallDirs)
SET(LIBDIR "lib")
if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
SET(LIBDIR "lib64")
endif()
SET(FLATBUFFERS_PREFIX_DIR ${THIRD_PARTY_PATH}/flatbuffers)
SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers)
SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers)
SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." FORCE)
IF(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ELSE(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${FLATBUFFERS_INCLUDE_DIR})
if(NOT HOST_CXX_COMPILER)
set(HOST_CXX_COMPILER ${CMAKE_CXX_COMPILER})
set(HOST_C_COMPILER ${CMAKE_C_COMPILER})
endif()
SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${HOST_C_COMPILER}")
ExternalProject_Add(
extern_flatbuffers
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/google/flatbuffers.git"
GIT_TAG "v1.12.0"
SOURCE_DIR ${FLATBUFFERS_SOURCES_DIR}
PREFIX ${FLATBUFFERS_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${FLATBUFFERS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-DFLATBUFFERS_BUILD_TESTS=OFF
${CROSS_COMPILE_CMAKE_ARGS}
${OPTIONAL_ARGS}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib")
add_custom_command(TARGET extern_flatbuffers POST_BUILD
COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib
)
ENDIF()
ENDIF(WIN32)
ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES})
ADD_DEPENDENCIES(flatbuffers extern_flatbuffers)
SET(FLATBUFFERS_FLATC_EXECUTABLE ${FLATBUFFERS_INSTALL_DIR}/bin/flatc)
function(register_generated_output file_name)
get_property(tmp GLOBAL PROPERTY FBS_GENERATED_OUTPUTS)
list(APPEND tmp ${file_name})
set_property(GLOBAL PROPERTY FBS_GENERATED_OUTPUTS ${tmp})
endfunction(register_generated_output)
function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT)
if(FLATBUFFERS_BUILD_LEGACY)
set(OPT ${OPT};--cpp-std c++0x)
else()
# --cpp-std is defined by flatc default settings.
endif()
message(STATUS "`${SRC_FBS}`: add generation of C++ code with '${OPT}'")
get_filename_component(SRC_FBS_DIR ${SRC_FBS} PATH)
message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}")
string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS})
add_custom_command(
OUTPUT ${GEN_HEADER}
COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}"
--cpp --gen-mutable --gen-object-api --reflect-names
--cpp-ptr-type flatbuffers::unique_ptr # Used to test with C++98 STLs
${OPT}
-I "${CMAKE_CURRENT_SOURCE_DIR}/tests/include_test"
-o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}"
DEPENDS flatbuffers
COMMENT "Run generation: '${GEN_HEADER}'")
register_generated_output(${GEN_HEADER})
add_custom_target(${TARGET} ALL DEPENDS ${GEN_HEADER})
endfunction()
set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers")
set(FRAMEWORK_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/framework.fbs")
compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
include_directories(${FLATBUFFERS_INCLUDE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR})
# C++ Train Demo
# Introduction
As is well known, Paddle-Lite can run inference on mobile devices; in fact, Paddle-Lite also supports training models on mobile. This document presents a training example with Paddle-Lite, based on the "Boston housing price prediction" task, also known as "fit-a-line".
## Introduction
You can learn more about this task from the book repository's
As is well known, Paddle-Lite can run inference on mobile devices; in fact, Paddle-Lite also supports training models on mobile. This document presents a training example with Paddle-Lite, based on the "Boston housing price prediction" task, also known as "fit-a-line".
You can learn more about this task from the book repository's
[documentation](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html)
[source code](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)
......@@ -10,18 +12,16 @@
It is modeled with Linear Regression.
This document mainly describes how to migrate it to Paddle-Lite for training.
Note: this is a tutorial on model training with the C++ API; the other APIs do not support training yet.
# Requirements
## Requirements
- An Android phone, used to run the training program
- Python with Paddle (version: 1.7.0) installed
- Python with Paddle (version >= 1.7.0) installed (a pip sketch follows this list)
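A minimal sketch of installing the Python-side requirement above (the exact version pin is illustrative; any Paddle >= 1.7.0 should work):

```shell
python -m pip install paddlepaddle==1.7.0
```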
# Quick start
## Quick start
## Step1 build paddle-lite
### Step1 build paddle-lite
Please follow the [official paddle-lite documentation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) to build the full_publish paddle-lite lib. Taking a Linux build as an example, the concrete commands are:
Please follow the official paddle-lite documentation to build the full_publish paddle-lite lib. Taking a Linux build as an example, the concrete commands are:
```shell
## set up the environment
......@@ -51,7 +51,7 @@ cd Paddle-Lite
Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so
```
## Step2 build lr_trainer
### Step2 build lr_trainer
```shell
cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/
......@@ -64,7 +64,7 @@ bin/
`-- demo_trainer
```
## Step3 download model and run it!
### Step3 download model and run it!
On your laptop, connect the phone via USB, enable developer mode, and run the following from any directory:
......@@ -102,7 +102,7 @@ sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
# More details
## More details
The model mentioned above was obtained by direct download; if you want to generate it yourself, run the following commands:
```shell
......@@ -125,9 +125,9 @@ md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
If you want to generate your own model for training, you can refer to the way the model is saved in `train.py`; a sketch follows.
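As a rough illustration only (this is not the actual `train.py`; the network, variable names, and the `lr_model` output directory are made up), saving a fit-a-line style program and its persistable parameters with the Paddle 1.7 fluid API could look like:

```python
import paddle.fluid as fluid

# Hypothetical fit-a-line style network; the real train.py may differ.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# ... run a few training iterations here ...

# Persist the parameters so that the on-device trainer can load them later.
fluid.io.save_persistables(exe, "lr_model", fluid.default_main_program())
```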
# Cross-checking against Paddle training results
## Cross-checking against Paddle training results
## The first 10 loss values
### The first 10 loss values
To verify the consistency between Paddle and Lite, we kept the model parameters and the data identical, set batch size = 1, trained for 10 batches, and recorded the loss values of both.
......@@ -171,11 +171,11 @@ sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
## Loss curves
### Loss curves
With the training batch size fixed at 20 and the training data globally shuffled each epoch, the loss curves of Paddle and Lite after training for 100 epochs are compared below.
![lr_loss](image/lr_loss.png)
![lr_loss](../images/lr_loss.png)
To reproduce the results above, the command for running Paddle with Python is:
......
......@@ -86,19 +86,28 @@ config.set_model_from_file(/YOU_MODEL_PATH/mobilenet_v1_opt.nb)
predictor = create_paddle_predictor(config)
```
(3) Set the input data
(3) Read data from an image
```python
image = Image.open('./example.jpg')
resized_image = image.resize((224, 224), Image.BILINEAR)
image_data = np.array(resized_image).flatten().tolist()
```
(4) Set the input data
```python
input_tensor = predictor.get_input(0)
input_tensor.resize([1, 3, 224, 224])
input_tensor.set_float_data([1.] * 3 * 224 * 224)
input_tensor.set_float_data(image_data)
```
(4) Run inference
(5) Run inference
```python
predictor.run()
```
(5) Get the output data
(6) Get the output data
```python
output_tensor = predictor.get_output(0)
print(output_tensor.shape())
......
......@@ -60,6 +60,13 @@ Welcome to Paddle-Lite's documentation!
demo_guides/rockchip_npu
demo_guides/mediatek_apu
.. toctree::
:maxdepth: 1
:caption: Training demos (preview)
:name: sec-train_demo_guides
demo_guides/cpp_train_demo
.. toctree::
:maxdepth: 1
:caption: API documentation
......
......@@ -61,7 +61,7 @@ inference_lite_lib.ios64.armv8 iOS预测库和头文件
- Tailoring the inference library (compile only the kernels & ops used in the model to reduce library size):
```shell
./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir
./lite/tools/build_ios.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir
```
```shell
--with_strip: (OFF|ON); whether to tailor the inference library according to the input model, default is OFF
......
......@@ -21,11 +21,11 @@ pip install paddlelite
- Method 2: download the opt executable
On the [releases page](https://github.com/PaddlePaddle/Paddle-Lite/releases), choose the `opt` conversion tool matching the version of your inference library
This document provides downloads of the optimization tool for `release/v2.6` and `release/v2.2.0`
This document provides downloads of the optimization tool for `release/v2.6.1` and `release/v2.2.0`
|Version | Linux | MacOS|
|---|---|---|
| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
| `release/v2.6.1` | [opt](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt_mac) |
|`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) |
- Method 3: build opt from source
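Whichever way you obtain `opt`, a typical invocation looks roughly like the sketch below (the model path and output name are placeholders; adjust them to your own model):

```shell
chmod +x ./opt
./opt --model_dir=./mobilenet_v1 \
      --valid_targets=arm \
      --optimize_out_type=naive_buffer \
      --optimize_out=./mobilenet_v1_opt
```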
......
......@@ -49,4 +49,4 @@ $ ./opt \
## 5. Testing tools
To help you better understand and use the Lite framework, we provide a [Debug tool](debug#debug) and a [Profile tool](debug#profiler) for users with further needs. The Lite Model Debug Tool can be used to check whether the corresponding variable values in a model differ between the Lite framework and the PaddlePaddle framework during inference, so that the problematic op can be located quickly and the issue can be reproduced and investigated easily. The Profile Monitor Tool helps you understand the time cost of each op; it automatically collects the number of times each op runs and its longest, shortest, and average execution times, providing a baseline for performance tuning. See the [related topic](debug) for more.
To help you better understand and use the Lite framework, we provide a [Debug tool](debug) and a [Profile tool](debug) for users with further needs. The Lite Model Debug Tool can be used to check whether the corresponding variable values in a model differ between the Lite framework and the PaddlePaddle framework during inference, so that the problematic op can be located quickly and the issue can be reproduced and investigated easily. The Profile Monitor Tool helps you understand the time cost of each op; it automatically collects the number of times each op runs and its longest, shortest, and average execution times, providing a baseline for performance tuning. See the [related topic](debug) for more.
......@@ -53,6 +53,8 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz")
endif()
endif()
......@@ -242,7 +244,6 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full"
......
......@@ -2,7 +2,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG))
lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog)
endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
endif()
if (LITE_ON_TINY_PUBLISH)
set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG")
......@@ -15,8 +15,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
target_link_libraries(paddle_full_api_shared framework_proto)
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files})
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry framework_fbs_header)
target_link_libraries(paddle_full_api_shared framework_proto op_registry)
if(LITE_WITH_X86)
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
......@@ -70,7 +71,7 @@ else()
set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto")
endif()
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h framework_fbs_header)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
......@@ -368,6 +369,9 @@ endif()
if (LITE_WITH_PYTHON)
add_subdirectory(python)
# add library for opt_base
lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
endif()
if (LITE_ON_TINY_PUBLISH)
......@@ -375,9 +379,6 @@ if (LITE_ON_TINY_PUBLISH)
endif()
# add library for opt_base
lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling opt")
......
......@@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
# Unlike static library, module library has to link target to be able to work
# as a single .so lib.
target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
add_dependencies(paddle_lite_jni framework_fbs_header)
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
......@@ -31,7 +32,7 @@ else()
endif()
set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS})
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h framework_fbs_header)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs})
......
......@@ -13,18 +13,24 @@
// limitations under the License.
#include "lite/api/cxx_api.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "lite/api/paddle_use_passes.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
std::vector<std::string> GetAllOps() {
return OpLiteFactory::Global().GetAllOps();
}
void Predictor::SaveModel(const std::string &dir,
lite_api::LiteModelType model_type,
bool record_info) {
......@@ -326,10 +332,8 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
}
}
if (is_quantized_model) {
#ifdef LITE_WITH_ARM
inner_places.insert(inner_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
#endif
}
Program program(*desc.get(), scope_, inner_places);
......
......@@ -41,6 +41,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
".tailored_kernels_source_list";
static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
std::vector<std::string> GetAllOps();
/*
* Predictor for inference, input a model, it will optimize and execute it.
*/
......
......@@ -52,12 +52,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
if (!status_is_cloned_) {
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_use_first_conv(),
config.mlu_first_conv_mean(),
config.mlu_first_conv_std(),
config.mlu_input_layout());
config.mlu_input_layout(),
config.mlu_firstconv_param());
#endif // LITE_WITH_MLU
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
......@@ -75,6 +73,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
int num_threads = config.x86_math_library_num_threads();
......
......@@ -15,8 +15,6 @@
#include "lite/api/light_api.h"
#include <algorithm>
#include <map>
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
namespace paddle {
namespace lite {
......
......@@ -13,6 +13,9 @@
// limitations under the License.
#include "lite/api/paddle_api.h"
#include <utility>
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h"
......@@ -21,6 +24,13 @@
#ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/target_wrapper.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/target_wrapper.h"
#endif
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif
namespace paddle {
namespace lite_api {
......@@ -106,6 +116,13 @@ void Tensor::CopyFromCpu(const T *src_data) {
data, src_data, num * sizeof(T), lite::IoDirection::HtoD, *io_stream_);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif
} else {
LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA";
......@@ -127,6 +144,13 @@ void Tensor::CopyToCpu(T *data) const {
lite::TargetWrapperCuda::StreamSync(*io_stream_);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif
} else {
LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA";
......@@ -148,6 +172,11 @@ template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kMLU>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kMLU>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kMLU>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kMLU>(const int8_t *);
template void Tensor::CopyToCpu(float *) const;
template void Tensor::CopyToCpu(int *) const;
template void Tensor::CopyToCpu(int8_t *) const;
......@@ -238,13 +267,9 @@ void CxxConfig::set_mlu_core_number(int core_number) {
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout;
}
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
mlu_use_first_conv_ = use_first_conv;
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
void CxxConfig::set_mlu_firstconv_param(const std::vector<float> &mean,
const std::vector<float> &std) {
mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std;
}
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
......@@ -252,18 +277,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
}
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
std::pair<std::vector<float>, std::vector<float>>
CxxConfig::mlu_firstconv_param() const {
return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_);
}
#endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
lite::TargetWrapperXPU::workspace_l3_size_per_thread = l3_size;
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_workspace_l3_size_per_thread' is ignored, please "
......@@ -273,7 +295,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetDev(dev_no);
lite::TargetWrapperXPU::SetDev(dev_no);
#else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
......@@ -282,7 +304,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::_multi_encoder_precision = precision;
lite::TargetWrapperXPU::multi_encoder_precision = precision;
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_multi_encoder_precision' is "
......
......@@ -21,6 +21,7 @@
#define PADDLE_LITE_API_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle_place.h" // NOLINT
......@@ -174,9 +175,8 @@ class LITE_API CxxConfig : public ConfigBase {
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
std::vector<float> mlu_first_conv_mean_{};
std::vector<float> mlu_first_conv_std_{};
#endif
public:
......@@ -232,24 +232,22 @@ class LITE_API CxxConfig : public ConfigBase {
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether use MLU's first conv kernel. First conv is a special kernel
// provided by MLU, its input is uint8, and also needs two 3-dimentional
// vectors which save all inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimentional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimentional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
// set the 3-dimensional mean vector and 3-dimensional std vector used by
// MLU's first conv
void set_mlu_firstconv_param(const std::vector<float>& mean,
const std::vector<float>& std);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
// std::pair<mean, std>
std::pair<std::vector<float>, std::vector<float>> mlu_firstconv_param() const;
#endif
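For illustration only, a hedged sketch of how the reworked MLU first-conv configuration above might be used from application code, assuming the library was built with LITE_WITH_MLU=ON (the mean/std values are placeholders, not recommendations):

```cpp
#include "paddle_api.h"  // shipped header; in-tree it is lite/api/paddle_api.h

void ConfigureMlu(paddle::lite_api::CxxConfig* config) {
  config->set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config->set_mlu_core_number(4);
  config->set_mlu_input_layout(DATALAYOUT(kNHWC));
  // Passing non-empty mean/std vectors enables the MLU first-conv path,
  // replacing the removed set_mlu_use_first_conv(true) call.
  config->set_mlu_firstconv_param({0.485f, 0.456f, 0.406f},
                                  {0.229f, 0.224f, 0.225f});
}
```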
// XPU only, set the size of the workspace memory from L3 cache for the
......
......@@ -15,8 +15,11 @@
#include "lite/api/paddle_api.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
DEFINE_string(model_dir, "", "");
namespace paddle {
......
......@@ -55,6 +55,8 @@ USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass);
USE_MIR_PASS(__xpu__mmdnn_fuse_pass);
......@@ -59,9 +59,9 @@ void TestModel(const std::vector<Place>& valid_places) {
}
auto* image_tensor = predictor.GetInput(1);
image_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 2})));
data = image_tensor->mutable_data<float>();
data[0] = FLAGS_im_height;
data[1] = FLAGS_im_width;
auto* data_1 = image_tensor->mutable_data<int>();
data_1[0] = FLAGS_im_height;
data_1[1] = FLAGS_im_width;
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
......
......@@ -127,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
split_merge_lod_tenosr.cc
reduce_prod.cc
lstm.cc
clip.cc
DEPS ${lite_kernel_deps} context tensor)
endif()
......@@ -763,24 +763,6 @@ void act_thresholded_relu<float>(
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
const float* dout_grad,
float* din_grad,
int size,
int threads) {
const float* ptr_out_grad = dout_grad;
float* ptr_in_grad = din_grad;
for (int i = 0; i < size; ++i) {
ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0];
ptr_out_grad++;
ptr_in_grad++;
din++;
}
}
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -90,12 +90,6 @@ template <typename T>
void act_thresholded_relu(
const T* din, T* dout, int size, float threshold, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
const T* din, const T* dout_grad, T* din_grad, int size, int threads);
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/clip.h"
#include <algorithm>
#include <limits>
#include <memory>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/saturate.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void clip_kernel_fp32(
const float* input, int64_t num, float min, float max, float* output) {
float tmp;
for (int64_t i = 0; i < num; i++) {
tmp = *input;
tmp = tmp > min ? tmp : min;
*output = tmp < max ? tmp : max;
input++;
output++;
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void clip_kernel_fp32(
const float* input, int64_t num, float min, float max, float* output);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/elementwise.h"
#include <math.h>
#include <algorithm>
#include "lite/backends/arm/math/funcs.h"
......@@ -1254,6 +1254,19 @@ void elementwise_max_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_div<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
*dout = *dinx / *diny;
dout++;
dinx++;
diny++;
}
}
template <>
void elementwise_div<float>(const float* dinx,
const float* diny,
......@@ -1306,6 +1319,28 @@ void elementwise_div<float>(const float* dinx,
}
}
template <>
void elementwise_div_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* din_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int p = 0; p < num; p++) {
*dout_ptr = *din_ptr / diny_data;
dout_ptr++;
din_ptr++;
}
}
}
}
template <>
void elementwise_div_broadcast<float>(const float* dinx,
const float* diny,
......@@ -1541,6 +1576,87 @@ void elementwise_div_relu_broadcast<float>(const float* dinx,
}
}
template <typename T>
void elementwise_mod_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const T* din_ptr = dinx + offset;
const T diny_data = diny[j];
T* dout_ptr = dout + offset;
int cnt = num >> 2;
int remain = num % 4;
for (int k = 0; k < cnt; ++k) {
register T dinx0 = din_ptr[0];
register T dinx1 = din_ptr[1];
register T dinx2 = din_ptr[2];
register T dinx3 = din_ptr[3];
dout_ptr[0] = dinx0 % diny_data;
dout_ptr[1] = dinx1 % diny_data;
dout_ptr[2] = dinx2 % diny_data;
dout_ptr[3] = dinx3 % diny_data;
din_ptr += 4;
dout_ptr += 4;
}
if (remain > 0) {
for (int p = 0; p < remain; p++) {
*dout_ptr++ = *din_ptr++ % diny_data;
}
}
}
}
}
template <typename T>
void elementwise_mod(const T* dinx, const T* diny, T* dout, int num) {
int cnt = num >> 2;
int remain = num % 4;
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
const T* dinx_ptr = dinx + (i << 2);
const T* diny_ptr = diny + (i << 2);
T* dout_ptr = dout + (i << 2);
register T dinx0 = dinx_ptr[0];
register T dinx1 = dinx_ptr[1];
register T dinx2 = dinx_ptr[2];
register T dinx3 = dinx_ptr[3];
register T diny0 = diny_ptr[0];
register T diny1 = diny_ptr[1];
register T diny2 = diny_ptr[2];
register T diny3 = diny_ptr[3];
dout_ptr[0] = dinx0 % diny0;
dout_ptr[1] = dinx1 % diny1;
dout_ptr[2] = dinx2 % diny2;
dout_ptr[3] = dinx3 % diny3;
}
if (remain > 0) {
const T* dinx_ptr = dinx + (cnt << 2);
const T* diny_ptr = diny + (cnt << 2);
T* dout_ptr = dout + (cnt << 2);
for (int i = 0; i < remain; i++) {
*dout_ptr++ = *dinx_ptr++ % *diny_ptr++;
}
}
}
template void elementwise_mod<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num);
template void elementwise_mod_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -253,6 +253,13 @@ template <typename T>
void elementwise_div_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_mod(const T* dinx, const T* diny, T* dout, int num);
template <typename T>
void elementwise_mod_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -25,6 +25,7 @@
#include "lite/backends/arm/math/axpy.h"
#include "lite/backends/arm/math/beam_search.h"
#include "lite/backends/arm/math/box_coder.h"
#include "lite/backends/arm/math/clip.h"
#include "lite/backends/arm/math/col_im_transform.h"
#include "lite/backends/arm/math/concat.h"
#include "lite/backends/arm/math/conv_block_utils.h"
......
......@@ -531,7 +531,7 @@ void softmax_inner1_large_axis<float>(const float* din,
}
float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax));
float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1));
for (j = 4 * j; j < axis_size; ++j) {
for (j = 4 * nn; j < axis_size; ++j) {
max_data = std::max(max_data, din_max_ptr[0]);
din_max_ptr++;
}
......@@ -557,7 +557,7 @@ void softmax_inner1_large_axis<float>(const float* din,
float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum));
float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1);
for (j = 4 * j; j < axis_size; ++j) {
for (j = 4 * nn; j < axis_size; ++j) {
dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data);
sum_data += dout_sum_ptr[0];
din_sum_ptr++;
......
......@@ -41,6 +41,8 @@
<< "CUDA: " << cudaGetErrorString(e); \
}
#define CUDA_POST_KERNEL_CHECK CUDA_CALL(cudaPeekAtLastError())
#define CUBLAS_CALL(func) \
{ \
auto e = (func); \
......@@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) {
return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
#endif
#if CUDNN_VERSION_MIN(8, 0, 0)
case CUDNN_STATUS_VERSION_MISMATCH:
return "CUDNN_STATUS_VERSION_MISMATCH";
#endif
}
return "Unknown cudnn status";
......
......@@ -13,6 +13,8 @@ nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
set (
math_cuda
......@@ -25,6 +27,8 @@ set (
cudnn_pool
cuda_gemm
cuda_batched_gemm
cuda_strided_gemm
cuda_sequence_padding
)
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
......@@ -161,15 +161,17 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
search_func);
} else {
CUDNN_CHECK(
cudnnGetConvolutionForwardAlgorithm(this->handle_,
int requestedAlgoCount = 1;
int returnedAlgoCount;
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(this->handle_,
this->input_desc_,
this->filter_desc_,
this->conv_desc_,
this->output_desc_,
this->preference_,
this->workspace_limit_bytes_,
&this->fwd_algo_));
requestedAlgoCount,
&returnedAlgoCount,
&this->algo_perf_));
this->fwd_algo_ = this->algo_perf_.algo;
}
CUDNN_CHECK(
cudnnGetConvolutionForwardWorkspaceSize(this->handle_,
......
......@@ -81,6 +81,7 @@ class CudnnConv2DBase {
cudaStream_t stream_;
cudnnHandle_t handle_;
cudnnConvolutionFwdAlgo_t fwd_algo_;
cudnnConvolutionFwdAlgoPerf_t algo_perf_;
cudnnTensorDescriptor_t input_desc_;
cudnnTensorDescriptor_t output_desc_;
cudnnTensorDescriptor_t bias_desc_;
......@@ -98,8 +99,6 @@ class CudnnConv2DBase {
const bool use_tensor_core_ = true;
const size_t workspace_limit_bytes_ = 4 * 1024 * 1024;
const cudnnConvolutionFwdPreference_t preference_ =
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
// For int8
Tensor temp_tensor_;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/sequence_padding.h"
#include "lite/backends/cuda/math/utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
enum CopyType { kSeqToPad, kPadToSeq };
template <typename T, CopyType Type>
__global__ void SequencePadKernel(T* dst,
const T* src,
const T* pad_value,
bool is_constant_pad,
const size_t* seq_offsets,
const int seq_num,
const int pad_seq_len,
const int step_width) {
size_t seq_idx = blockIdx.y;
size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y;
size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width;
size_t pad_data_offset = (seq_idx * pad_seq_len + step_idx) * step_width;
T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset);
const T* src_data =
src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);
if (step_idx < seq_len) {
for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
dst_data[i] = src_data[i];
}
} else if (step_idx < pad_seq_len && Type == kSeqToPad) {
for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i];
}
}
}
template <typename T>
void SequencePadding(T* pad_data,
const T* seq_data,
const T* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream) {
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
SequencePadKernel<T, kSeqToPad><<<grid, threads, 0, *stream>>>(
pad_data,
seq_data,
pad_value_data,
is_constant_pad,
seq_offsets_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
template <typename T>
void SequenceUnpadding(T* seq_data,
const T* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream) {
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
SequencePadKernel<T, kPadToSeq><<<grid, threads, 0, *stream>>>(
seq_data,
pad_data,
nullptr,
false,
seq_offsets_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
template void SequencePadding(float* pad_data,
const float* seq_data,
const float* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template void SequencePadding(half* pad_data,
const half* seq_data,
const half* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template void SequenceUnpadding(float* seq_data,
const float* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template void SequenceUnpadding(half* seq_data,
const half* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <string>
#include <vector>
#include "lite/core/context.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
void SequenceUnpadding(T* seq_data,
const T* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template <typename T>
void SequencePadding(T* pad_data,
const T* seq_data,
const T* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/strided_gemm.h"
#include <iostream>
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PtypeIn, typename PtypeOut>
bool StridedGemm<PtypeIn, PtypeOut>::init(const bool trans_a,
const bool trans_b,
Context<TARGET(kCUDA)>* ctx) {
if (cu_handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasCreate(&cu_handle_));
CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_));
}
cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
return true;
}
template <>
bool StridedGemm<float, float>::run(const float alpha,
const float beta,
const int m,
const int n,
const int k,
const float* a_data,
const float* b_data,
float* c_data,
const int batch_size,
const int64_t stride_a,
const int64_t stride_b) {
lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m;
ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
const int64_t stride_c = m_ * n_;
CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
b_data,
CUDA_R_32F,
ldb_,
stride_b,
a_data,
CUDA_R_32F,
lda_,
stride_a,
&beta,
c_data,
CUDA_R_32F,
ldc_,
stride_c,
batch_size,
CUDA_R_32F,
algo_));
return true;
}
template <>
bool StridedGemm<half, half>::run(const half alpha,
const half beta,
const int m,
const int n,
const int k,
const half* a_data,
const half* b_data,
half* c_data,
const int batch_size,
const int64_t stride_a,
const int64_t stride_b) {
lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m;
ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
const int64_t stride_c = m_ * n_;
CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
b_data,
CUDA_R_16F,
ldb_,
stride_b,
a_data,
CUDA_R_16F,
lda_,
stride_a,
&beta,
c_data,
CUDA_R_16F,
ldc_,
stride_c,
batch_size,
CUDA_R_16F,
algo_));
return true;
}
template class StridedGemm<float, float>;
template class StridedGemm<half, half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PtypeIn, typename PtypeOut>
class StridedGemm {
public:
StridedGemm() : cu_handle_(nullptr) {}
~StridedGemm() {}
bool init(const bool trans_a,
const bool trans_b,
Context<TARGET(kCUDA)>* ctx);
bool run(const PtypeIn alpha,
const PtypeIn beta,
const int m,
const int n,
const int k,
const PtypeIn* a_data,
const PtypeIn* b_data,
PtypeOut* c_data,
const int batch_size,
const int64_t stride_a,
const int64_t stride_b);
private:
cudaStream_t exe_stream_;
cublasHandle_t cu_handle_;
cublasOperation_t cu_trans_a_;
cublasOperation_t cu_trans_b_;
int m_{-1};
int n_{-1};
int k_{-1};
int lda_{-1};
int ldb_{-1};
int ldc_{-1};
cublasGemmAlgo_t algo_{CUBLAS_GEMM_DEFAULT_TENSOR_OP};
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -174,24 +174,9 @@ void Transpose<T>::transpose(T* dst,
TransposeCUDAImpl<T>(src_dims, axes, src, dst, &Y_dims_, &strides_, stream);
}
// template <typename T>
// void Transpose<T>::transpose(T* dst,
// const T* src,
// const std::vector<int>& src_dims,
// const std::vector<int>& axes,
// cudaStream_t* stream) {
// std::vector<int64_t> _src_dims(src_dims.size(), 0);
// std::transform(
// src_dims.begin(),
// src_dims.end(),
// _src_dims.begin(),
// [](int data) -> int64_t { return static_cast<int64_t>(data); });
// TransposeCUDAImpl<T>(_src_dims, axes, src, dst, &Y_dims_, &strides_,
// stream);
//}
template class Transpose<int8_t>;
template class Transpose<float>;
template class Transpose<half>;
} // namespace math
} // namespace cuda
......
......@@ -15,6 +15,7 @@
#include "lite/backends/mlu/target_wrapper.h"
#include <memory>
#include <utility>
#include "lite/backends/mlu/mlu_utils.h"
......@@ -36,6 +37,13 @@ void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) {
} // namespace mlu
thread_local cnmlCoreVersion_t TargetWrapperMlu::mlu_core_version_{CNML_MLU270};
thread_local int TargetWrapperMlu::mlu_core_number_{1};
thread_local bool TargetWrapperMlu::use_first_conv_{false};
thread_local std::vector<float> TargetWrapperMlu::mean_vec_;
thread_local std::vector<float> TargetWrapperMlu::std_vec_;
thread_local DataLayoutType TargetWrapperMlu::input_layout_{DATALAYOUT(kNCHW)};
size_t TargetWrapperMlu::num_devices() {
uint32_t dev_count = 0;
CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed";
......@@ -77,15 +85,42 @@ void TargetWrapperMlu::MemcpySync(void* dst,
LOG(FATAL) << "Unsupported IoDirection" << static_cast<int>(dir);
}
}
void TargetWrapperMlu::SetMLURunMode(
lite_api::MLUCoreVersion core_version,
int core_number,
DataLayoutType input_layout,
std::pair<std::vector<float>, std::vector<float>> firstconv_param) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
mean_vec_ = firstconv_param.first;
std_vec_ = firstconv_param.second;
use_first_conv_ = !(mean_vec_.empty() || std_vec_.empty());
input_layout_ = input_layout;
}
cnmlCoreVersion_t TargetWrapperMlu::MLUCoreVersion() {
return mlu_core_version_;
}
int TargetWrapperMlu::MLUCoreNumber() { return mlu_core_number_; }
bool TargetWrapperMlu::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& TargetWrapperMlu::MeanVec() { return mean_vec_; }
const std::vector<float>& TargetWrapperMlu::StdVec() { return std_vec_; }
// void TargetWrapperMlu::MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const stream_t& stream) {
// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync.";
// MemcpySync(dst, src, size, dir);
// }
DataLayoutType TargetWrapperMlu::InputLayout() { return input_layout_; }
} // namespace lite
} // namespace paddle
......@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
#include <utility>
#include <vector>
#include "lite/backends/mlu/mlu_utils.h"
#include "lite/core/target_wrapper.h"
......@@ -43,11 +45,25 @@ class TargetWrapper<TARGET(kMLU)> {
const void* src,
size_t size,
IoDirection dir);
// static void MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const queue_t& queue);
static void SetMLURunMode(
lite_api::MLUCoreVersion core_version,
int core_number,
DataLayoutType input_layout,
std::pair<std::vector<float>, std::vector<float>> firstconv_param);
static cnmlCoreVersion_t MLUCoreVersion();
static int MLUCoreNumber();
static bool UseFirstConv();
static const std::vector<float>& MeanVec();
static const std::vector<float>& StdVec();
static DataLayoutType InputLayout();
private:
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
};
} // namespace lite
......
......@@ -20,94 +20,120 @@ namespace paddle {
namespace lite {
namespace npu {
bool WriteToOMFile(const domi::ModelBufferData& om_model_buff,
std::string om_file_path) {
FILE* fp;
fp = fopen(om_file_path.c_str(), "wb");
CHECK(fp != nullptr) << om_file_path << " open failed!";
uint32_t write_size =
(uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp);
CHECK_EQ(write_size, om_model_buff.length) << "write om file failed !";
fclose(fp);
return true;
}
bool ReadFromOMFile(domi::ModelBufferData* om_model_buff,
std::string om_file_path) {
FILE* fp;
fp = fopen(om_file_path.c_str(), "rb");
CHECK(fp != nullptr) << om_file_path << " open failed!";
fseek(fp, 0, SEEK_END);
uint32_t model_length = (uint32_t)ftell(fp);
fseek(fp, 0, SEEK_SET);
om_model_buff->data = malloc(model_length);
om_model_buff->length = model_length;
uint32_t read_size =
(uint32_t)fread(om_model_buff->data, 1, model_length, fp);
CHECK_EQ(read_size, model_length) << "read om file failed !";
fclose(fp);
return true;
std::shared_ptr<hiai::AiModelMngerClient> Device::Load(
const std::string& model_name,
std::vector<char>* model_buffer,
bool* model_comp) {
// Create a HiAI model manager client to load the HiAI om model
auto model_client = std::make_shared<hiai::AiModelMngerClient>();
if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] Init hiai model client failed!";
return nullptr;
}
// Check HiAI DDK version
const char* ddk_version = model_client->GetVersion();
if (ddk_version) {
LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version;
} else {
LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!";
}
// Check model compatibility
auto model_desc = std::make_shared<hiai::AiModelDescription>(
model_name, freq_level(), framework_type(), model_type(), device_type());
model_desc->SetModelBuffer(
reinterpret_cast<const void*>(model_buffer->data()),
model_buffer->size());
if (!*model_comp &&
model_client->CheckModelCompatibility(*model_desc, *model_comp) !=
hiai::AI_SUCCESS) {
*model_comp = false;
VLOG(3) << "[NPU] model is NOT compatiblitiable, setting model_comp to "
<< *model_comp;
} else {
*model_comp = true;
VLOG(3) << "[NPU] model is compatiblitiable, setting model_comp to "
<< *model_comp;
}
// Rebuild and write the data of the compatible model to the model buffer
if (!*model_comp) {
std::shared_ptr<hiai::AiModelBuilder> model_builder =
std::make_shared<hiai::AiModelBuilder>(model_client);
hiai::MemBuffer* org_model_buffer = model_builder->InputMemBufferCreate(
reinterpret_cast<void*>(model_buffer->data()), model_buffer->size());
if (org_model_buffer) {
std::vector<hiai::MemBuffer*> org_model_buffers;
org_model_buffers.push_back(org_model_buffer);
hiai::MemBuffer* new_model_buffer = model_builder->OutputMemBufferCreate(
framework_type(), org_model_buffers);
// VLOG(3) << "[NPU] new model buffer memeory size is " <<
// new_model_buffer->GetMemBufferSize();
if (new_model_buffer) {
uint32_t new_model_size = 0;
if (model_builder->BuildModel(org_model_buffers,
new_model_buffer,
new_model_size) == hiai::AI_SUCCESS) {
// need to change to new_model_size as GetMemBufferSize is not
// correct.
model_buffer->resize(new_model_size);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
new_model_buffer->GetMemBufferData(),
new_model_size);
// Reset the model buffer
model_desc->SetModelBuffer(
reinterpret_cast<const void*>(model_buffer->data()),
model_buffer->size());
VLOG(3) << "[NPU] Rebuild the compatible model done.";
} else {
LOG(WARNING) << "[NPU] Rebuild the compatible model failed!";
}
model_builder->MemBufferDestroy(new_model_buffer);
} else {
LOG(WARNING) << "[NPU] OutputMemBufferCreate failed!";
}
model_builder->MemBufferDestroy(org_model_buffer);
} else {
LOG(WARNING) << "[NPU] InputMemBufferCreate failed!";
}
}
// Load the compatible model
std::vector<std::shared_ptr<hiai::AiModelDescription>> model_descs{
model_desc};
if (model_client->Load(model_descs) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!";
return nullptr;
}
VLOG(3) << "[NPU] Load model done.";
return model_client;
}
std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
const std::string model_cache_full_dir = "" // NOLINT
) {
VLOG(3) << "[NPU] Build model";
// Build the HiAI IR graph to the HiAI om model
std::vector<char>* model_buffer) {
// Convert the HiAI IR graph to the HiAI om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
ge::Model om_model("model", "model");
om_model.SetGraph(ir_graph);
domi::HiaiIrBuild ir_build;
domi::ModelBufferData om_model_buf;
if (!model_cache_full_dir.empty() && IsFileExists(model_cache_full_dir)) {
VLOG(3) << "Will read om model from " << model_cache_full_dir;
ReadFromOMFile(&om_model_buf, model_cache_full_dir);
} else {
if (!ir_build.CreateModelBuff(om_model, om_model_buf)) {
// Build the HiAI om model, serialize and output it to the om buffer
domi::HiaiIrBuild ir_build;
domi::ModelBufferData om_buffer;
if (!ir_build.CreateModelBuff(om_model, om_buffer)) {
LOG(WARNING) << "[NPU] CreateModelBuff failed!";
return nullptr;
return false;
}
if (!ir_build.BuildIRModel(om_model, om_model_buf)) {
if (!ir_build.BuildIRModel(om_model, om_buffer)) {
LOG(WARNING) << "[NPU] BuildIRModel failed!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
ir_build.ReleaseModelBuff(om_buffer);
return false;
}
if (!model_cache_full_dir.empty()) {
VLOG(3) << "Will write om model to " << model_cache_full_dir;
WriteToOMFile(om_model_buf, model_cache_full_dir);
}
}
// Create a HiAI model manager client to load the HiAI om model
std::shared_ptr<hiai::AiModelMngerClient> model_client(
new hiai::AiModelMngerClient());
if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
auto model_desc = std::make_shared<hiai::AiModelDescription>(
model_name, freq_level(), framework_type(), model_type(), device_type());
model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
std::vector<std::shared_ptr<hiai::AiModelDescription>> model_descs;
model_descs.push_back(model_desc);
if (model_client->Load(model_descs) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
ir_build.ReleaseModelBuff(om_model_buf);
VLOG(3) << "[NPU] Build done";
return model_client;
model_buffer->resize(om_buffer.length);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
reinterpret_cast<void*>(om_buffer.data),
om_buffer.length);
ir_build.ReleaseModelBuff(om_buffer);
VLOG(3) << "[NPU] Build model done.";
return true;
}
} // namespace npu
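Putting the two entry points together, the flow the NPU bridges are expected to follow is roughly the sketch below (same assumptions as above; error handling trimmed).
  // Sketch: build the IR graph into an om buffer, then hand that buffer to Load().
  std::vector<char> model_buffer;
  if (!lite::npu::Device::Global().Build(input_nodes, output_nodes, &model_buffer)) {
    LOG(WARNING) << "[NPU] Build om model failed!";
    return false;
  }
  bool model_comp = true;  // freshly built, so assume it matches this device
  auto model_client =
      lite::npu::Device::Global().Load("subgraph_0", &model_buffer, &model_comp);
  return model_client != nullptr;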
......
......@@ -38,14 +38,18 @@ class Device {
int model_type() { return model_type_; }
int device_type() { return device_type_; }
// Load the HiAI om model from buffer, rebuild the model if it's incompatible
// with the current device, then create a HiAI model manager client (from HiAI
// Server) to run inference
std::shared_ptr<hiai::AiModelMngerClient> Load(
const std::string& model_name,
std::vector<char>* model_buffer,
bool* model_comp);
// Build the HiAI IR graph to the om model, serialize it and write the result
// into the model buffer.
std::shared_ptr<hiai::AiModelMngerClient> Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
bool Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
const std::string model_cache_name // NOLINT
); // NOLINT
std::vector<char>* model_buffer);
private:
int freq_level_{3};
......
......@@ -119,7 +119,7 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
}
}
cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size,
size_t max_work_size,
int divisor) {
int preferred_lws = 0;
......@@ -157,7 +157,7 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
static_cast<size_t>(gws0)};
#endif
}
cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
size_t max_work_size,
int divisor) {
int preferred_lws = 0;
......
......@@ -62,10 +62,10 @@ class CLContext {
cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size,
size_t max_work_size,
int divitor = 2);
cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
size_t max_work_size,
int divitor = 2);
bool IsArmMali();
......
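For reference, a host-side sketch of how the renamed tuning helper might be used when enqueuing a kernel. Only LocalWorkSizeTune comes from this patch; the device, context and queue objects are assumed to exist.
  cl::NDRange global_work_size(256, 64, 4);
  size_t max_work_size =
      device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();  // assumed cl::Device
  cl::NDRange local_work_size =
      cl_context.LocalWorkSizeTune(global_work_size, max_work_size, /*divisor=*/2);
  queue.enqueueNDRangeKernel(
      kernel, cl::NullRange, global_work_size, local_work_size);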
......@@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a,
} else {
for (int cidx = col; cidx < N; ++cidx) {
for (int ridx = row; ridx < M; ++ridx) {
CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0;
CL_COMPUTE_DTYPE a0 = 0;
CL_COMPUTE_DTYPE b0 = 0;
CL_COMPUTE_DTYPE c0 = bias ? bias[cidx] : 0;
for (int p = 0; p < K; ++p) {
a0 = *(a + ridx * K + p);
b0 = *(b + p * N + cidx),
......
......@@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt(
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......@@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple(
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
......
......@@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
......@@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
......@@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
......@@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3(
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
......@@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int ou_nh,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void transpose_4d(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W) {
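  // Each work-item writes a single CL_DTYPE4 pixel that packs four consecutive
  // output channels (out_c * 4 .. out_c * 4 + 3); the out_C range checks below
  // zero-fill the unused lanes when out_C is not a multiple of 4.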
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = 1;
const int out_h = out_nh % out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4 + 2;
const int out_c3 = out_c * 4 + 3;
const int in_n = out_n;
const int in_c = out_w * 0.25;
const int in_h0 = out_c0;
const int in_h1 = out_c1;
const int in_h2 = out_c2;
const int in_h3 = out_c3;
const int in_w = out_h;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
int2 input_pos0;
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;
input_pos0.x = in_W * in_c + in_w;
input_pos0.y = in_n * in_h0;
input_pos1.x = in_W * in_c + in_w;
input_pos1.y = in_n * in_h1;
input_pos2.x = in_W * in_c + in_w;
input_pos2.y = in_n * in_h2;
input_pos3.x = in_W * in_c + in_w;
input_pos3.y = in_n * in_h3;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 input0;
CL_DTYPE4 input1;
CL_DTYPE4 input2;
CL_DTYPE4 input3;
CL_DTYPE4 output;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0);
if (out_w % 4 == 0) {
output.x = input0.x;
} else if (out_w % 4 == 1) {
output.x = input0.y;
} else if (out_w % 4 == 2) {
output.x = input0.z;
} else {
output.x = input0.w;
}
if (out_C - out_c * 4 >= 2) {
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1);
if(out_w % 4 == 0) {
output.y = input1.x;
} else if(out_w % 4 == 1) {
output.y = input1.y;
} else if(out_w % 4 == 2) {
output.y = input1.z;
} else {
output.y = input1.w;
}
} else {
output.y = 0.0f;
}
if (out_C - out_c * 4 >= 3) {
input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2);
if (out_w % 4 == 0){
output.z = input2.x;
} else if (out_w % 4 == 1) {
output.z = input2.y;
} else if (out_w % 4 == 2) {
output.z = input2.z;
} else {
output.z = input2.w;
}
} else {
output.z = 0.0f;
}
if (out_C - out_c * 4 >= 4) {
input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3);
if (out_w % 4 == 0) {
output.w = input3.x;
} else if (out_w % 4 == 1) {
output.w = input3.y;
} else if (out_w % 4 == 2) {
output.w = input3.z;
} else {
output.w = input3.w;
}
} else {
output.w = 0.0f;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
__kernel void transpose(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = 1;
const int out_h = out_nh % out_H;
const int in_n = 1;
const int in_c = out_c;
const int in_w = out_h;
const int in_h = out_w;
int2 input_pos;
int2 output_pos;
input_pos.x = in_c * in_W + in_w;
input_pos.y = in_n * in_h;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_n * out_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 input;
CL_DTYPE4 output;
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);
output = input;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
\ No newline at end of file
......@@ -20,8 +20,8 @@ limitations under the License. */
#include "lite/backends/x86/cupti_lib_path.h"
#include "lite/backends/x86/port.h"
#include "lite/backends/x86/warpctc_lib_path.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
// DEFINE_string(cudnn_dir,
// "",
......@@ -178,7 +178,7 @@ auto error_msg =
#endif // !_WIN32
if (throw_on_error) {
CHECK(dso_handle != nullptr);
// PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno);
// CHECK(nullptr != dso_handle, error_msg, dlPath, errorno);
} else if (nullptr == dso_handle) {
// LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno);
}
......
......@@ -319,8 +319,8 @@ void BenchKernelSgd() {
const T lr = 0.1;
auto UnDuplicatedRandomVec = [](
int n, const int64_t lower, const int64_t upper) -> std::vector<int64_t> {
PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
PADDLE_ENFORCE_GT(n, 0);
CHECK_LE(static_cast<size_t>(upper - lower), n - 1);
CHECK_GT(n, 0);
std::vector<int64_t> all, out;
for (int i = 0; i < n; ++i) {
all.push_back(i);
......
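The x86 hunks that follow are largely a mechanical migration from the printf-style PADDLE_ENFORCE* macros in lite/utils/paddle_enforce.h to the stream-style CHECK* macros from lite/utils/cp_logging.h. The rewrite pattern, using two lines taken from the hunks below, is:
  // Before: condition and message arguments packed into one macro call.
  //   PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
  //   PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
  // After: comparison-only macro, with the diagnostic text streamed on failure.
  CHECK_EQ(m_, 1) << "Only support m==1 yet";
  CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size;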
......@@ -129,11 +129,11 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const emb_seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.table_height, 0);
PADDLE_ENFORCE_GT(attr.table_width, 0);
PADDLE_ENFORCE_GT(attr.index_height, 0);
PADDLE_ENFORCE_GT(attr.index_width, 0);
PADDLE_ENFORCE_GT(attr.out_width, 0);
CHECK_GT(attr.table_height, 0);
CHECK_GT(attr.table_width, 0);
CHECK_GT(attr.index_height, 0);
CHECK_GT(attr.index_width, 0);
CHECK_GT(attr.out_width, 0);
return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -17,7 +17,7 @@
#include <string>
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......
......@@ -27,7 +27,7 @@ void MatMulJitCode::genCode() {
preCode();
int block, rest;
const auto groups = packed_groups(n_, k_, &block, &rest);
PADDLE_ENFORCE_GT(groups.front(), 0);
CHECK_GT(groups.front(), 0);
const int block_len = sizeof(float) * block;
const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
......@@ -116,9 +116,9 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const matmul_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.m, 0);
PADDLE_ENFORCE_GT(attr.n, 0);
PADDLE_ENFORCE_GT(attr.k, 0);
CHECK_GT(attr.m, 0);
CHECK_GT(attr.n, 0);
CHECK_GT(attr.k, 0);
return make_unique<MatMulJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -19,7 +19,7 @@
#include <vector>
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......@@ -32,7 +32,7 @@ class MatMulJitCode : public JitCode {
size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
CHECK_EQ(m_, 1) << "Only support m==1 yet";
this->genCode();
}
......
......@@ -69,8 +69,8 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.w, 0);
PADDLE_ENFORCE_GT(attr.h, 0);
CHECK_GT(attr.w, 0);
CHECK_GT(attr.h, 0);
return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -17,7 +17,7 @@
#include <string>
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......@@ -125,8 +125,8 @@ class SeqPoolJitCode : public JitCode {
vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
reg_idx++;
}
PADDLE_ENFORCE_EQ(
reg_idx, rest_used_num_regs, "All heights should use same regs");
CHECK_EQ(reg_idx, rest_used_num_regs)
<< "All heights should use same regs";
for (int i = 0; i < reg_idx; ++i) {
vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
}
......
......@@ -17,7 +17,7 @@
#include <memory>
#include <vector>
#include "lite/backends/x86/jit/registry.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -113,9 +113,9 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const sgd_attr_t& attr) const override {
PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
CHECK_EQ(attr.param_width, attr.grad_width);
CHECK_LE(attr.selected_rows_size, attr.grad_height);
CHECK_GE(attr.selected_rows_size, 0);
return make_unique<SgdJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -16,7 +16,7 @@
#include <memory>
#include <vector>
#include "lite/backends/x86/jit/registry.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -76,7 +76,7 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
}
std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
PADDLE_ENFORCE_GT(w, 0);
CHECK_GT(w, 0);
return make_unique<VBroadcastJitCode>(w, CodeSize(w));
}
};
......
......@@ -21,8 +21,8 @@
// posix_memalign
#include "lite/backends/x86/cpu_info.h"
#include "lite/backends/x86/jit/macro.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
#ifndef _WIN32
#define posix_memalign_free free
......@@ -62,12 +62,10 @@ void* GenBase::operator new(size_t size) {
#ifdef _WIN32
ptr = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size),
0,
"GenBase Alloc %ld error!",
size);
CHECK_EQ(posix_memalign(&ptr, alignment, size), 0) << "GenBase Alloc " << size
<< " error!";
#endif
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size;
return ptr;
}
......
......@@ -14,9 +14,10 @@
#include "lite/backends/x86/jit/helper.h"
#include <algorithm> // tolower
#include <cstring>
#include <numeric>
#include <string>
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -104,12 +105,12 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
int block, rest;
const auto groups = packed_groups(n, k, &block, &rest);
std::for_each(groups.begin(), groups.end(), [&](int i) {
PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0.");
CHECK_GT(i, 0) << "each element of groups should be larger than 0.";
});
int sum = std::accumulate(groups.begin(), groups.end(), 0);
std::memset(dst, 0, k * sum * block * sizeof(float));
PADDLE_ENFORCE_GE(
sum * block, n, "The packed n should be equal to or larger than n");
CHECK_GE(sum * block, n)
<< "The packed n should be equal to or larger than n";
const int block_len = sizeof(float) * block;
int n_offset = 0;
......
......@@ -23,7 +23,7 @@
#include "lite/backends/x86/jit/kernel_base.h"
#include "lite/backends/x86/jit/kernel_key.h"
#include "lite/backends/x86/jit/kernel_pool.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -78,8 +78,8 @@ inline const Kernel* GetReferKernel() {
auto& ref_pool = ReferKernelPool::Instance().AllKernels();
KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace());
auto ref_iter = ref_pool.find(kkey);
PADDLE_ENFORCE(ref_iter != ref_pool.end(),
"Every Kernel should have reference function.");
CHECK(ref_iter != ref_pool.end())
<< "Every Kernel should have reference function.";
auto& ref_impls = ref_iter->second;
for (auto& impl : ref_impls) {
auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
......@@ -94,7 +94,7 @@ template <typename KernelTuple>
inline typename KernelTuple::func_type GetReferFunc() {
auto ker = GetReferKernel<KernelTuple>();
auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
PADDLE_ENFORCE(p, "The Refer kernel should exsit");
CHECK(p) << "The Refer kernel should exsit";
return p->GetFunc();
}
......@@ -125,7 +125,7 @@ std::vector<const Kernel*> GetAllCandidateKernels(
// The last implementation should be reference function on CPUPlace.
auto ref = GetReferKernel<KernelTuple>();
PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
CHECK(ref != nullptr) << "Refer Kernel can not be empty.";
res.emplace_back(ref);
return res;
}
......@@ -140,11 +140,11 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
std::string name = k->ImplType();
if (name == "JitCode") {
auto i = dynamic_cast<const GenBase*>(k);
PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
CHECK(i) << "jitcode kernel cast can not fail.";
res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
} else {
auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
PADDLE_ENFORCE(i, "kernel cast can not fail.");
CHECK(i) << "kernel cast can not fail.";
res.emplace_back(std::make_pair(name, i->GetFunc()));
}
}
......@@ -166,7 +166,7 @@ template <typename KernelTuple, typename PlaceType = lite::fluid::CPUPlace>
typename KernelTuple::func_type GetDefaultBestFunc(
const typename KernelTuple::attr_type& attr) {
auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
PADDLE_ENFORCE_GE(funcs.size(), 1UL);
CHECK_GE(funcs.size(), 1UL);
// Here could do some runtime benchmark of this attr and return the best one.
// But yet just get the first one as the default best one,
// which is searched in order and tuned by offline.
......
......@@ -14,7 +14,7 @@
#include "lite/backends/x86/jit/kernel_key.h"
#include <xxhash.h> // XXH64: 13.8 GB/s
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -18,7 +18,7 @@
#include <type_traits>
#include <vector>
#include "lite/backends/x86/jit/kernel_base.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -104,11 +104,11 @@ void EmbSeqPool(const T* table,
const int64_t* idx,
T* out,
const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
CHECK_EQ(attr->table_width * attr->index_width, attr->out_width);
auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(
idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i);
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i]
<< " i: " << i;
CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i;
};
for (int64_t w = 0; w != attr->index_width; ++w) {
......@@ -175,22 +175,22 @@ void Sgd(const T* lr,
const int64_t* rows,
T* out,
const sgd_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
CHECK_EQ(attr->param_width, attr->grad_width);
CHECK_LE(attr->selected_rows_size, attr->grad_height);
T scalar = -lr[0];
int width = attr->grad_width;
if (out == param) {
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
CHECK_LT(h_idx, attr->param_height);
CHECK_GE(h_idx, 0);
VAXPY(scalar, grad + i * width, out + h_idx * width, width);
}
} else {
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
CHECK_LT(h_idx, attr->param_height);
CHECK_GE(h_idx, 0);
VScal(&scalar, grad + i * width, out + h_idx * width, width);
VAdd(param + h_idx * width,
out + h_idx * width,
......
......@@ -22,7 +22,6 @@
#include "lite/backends/x86/jit/kernel_base.h"
#include "lite/backends/x86/jit/macro.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
namespace lite {
......@@ -480,12 +479,12 @@ void EmbSeqPool(const T* table,
const int64_t* idx,
T* out,
const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
CHECK_EQ(attr->table_width * attr->index_width, attr->out_width);
auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(
idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i);
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i]
<< " i: " << i;
CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i;
};
for (int64_t w = 0; w != attr->index_width; ++w) {
......@@ -527,12 +526,12 @@ void Sgd(const T* lr,
const int64_t* rows,
T* out,
const lite::jit::sgd_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
CHECK_EQ(attr->param_width, attr->grad_width);
CHECK_LE(attr->selected_rows_size, attr->grad_height);
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
CHECK_LT(h_idx, attr->param_height);
CHECK_GE(h_idx, 0);
for (int64_t j = 0; j < attr->grad_width; ++j) {
out[h_idx * attr->grad_width + j] =
param[h_idx * attr->grad_width + j] -
......
......@@ -910,8 +910,8 @@ void TestKernelSgd() {
const T lr = 0.1;
auto UnDuplicatedRandomVec = [](
int n, const int64_t lower, const int64_t upper) -> std::vector<int64_t> {
PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
PADDLE_ENFORCE_GT(n, 0);
CHECK_LE(static_cast<size_t>(upper - lower), n - 1);
CHECK_GT(n, 0);
std::vector<int64_t> all, out;
for (int i = 0; i < n; ++i) {
all.push_back(i);
......
......@@ -116,7 +116,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end());
// if (!lite::fluid::CheckLoD(lod)) {
// //PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
// //LOG(FATAL)<<"lod %s is not right", framework::LoDToString(lod));
//}
selected_ids->set_lod(lod);
selected_scores->set_lod(lod);
......
......@@ -23,7 +23,7 @@ namespace math {
MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
PADDLE_ENFORCE_GT(tensor_dim.size(), 1u);
CHECK_GT(tensor_dim.size(), 1u);
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
......
......@@ -287,22 +287,22 @@ struct CBlas<double> {
template <>
struct CBlas<lite::fluid::float16> {
static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
static void GEMM(...) { LOG(FATAL) << "float16 GEMM not supported on CPU"; }
static void SMM_GEMM(...) {
PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
LOG(FATAL) << "float16 SMM_GEMM not supported on CPU";
}
static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
static void VMUL(...) { LOG(FATAL) << "float16 VMUL not supported on CPU"; }
static void VEXP(...) { LOG(FATAL) << "float16 VEXP not supported on CPU"; }
static void VSQUARE(...) {
PADDLE_THROW("float16 VSQUARE not supported on CPU");
LOG(FATAL) << "float16 VSQUARE not supported on CPU";
}
static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); };
static void VPOW(...) { LOG(FATAL) << "float16 VPOW not supported on CPU"; }
static void DOT(...) { LOG(FATAL) << "float16 DOT not supported on CPU"; };
static void SCAL(...) { LOG(FATAL) << "float16 SCAL not supported on CPU"; };
static void ASUM(...) { LOG(FATAL) << "float16 ASUM not supported on CPU"; };
#ifdef PADDLE_WITH_MKLML
static void GEMM_BATCH(...) {
PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
LOG(FATAL) << "float16 GEMM_BATCH not supported on CPU";
}
#endif
};
......@@ -461,11 +461,11 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
auto dim_a = mat_a.dims();
auto dim_b = mat_b.dims();
auto dim_out = mat_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
// PADDLE_ENFORCE(
// mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target(),
// "The targets of matrices must be same");
CHECK(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2)
<< "The input and output of matmul be matrix";
// CHECK(
// mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target())
// << "The targets of matrices must be same";
int M = dim_out[0];
int N = dim_out[1];
......@@ -746,7 +746,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
T alpha,
lite::Tensor *mat_out,
T beta) const {
PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
CHECK_EQ(dim_a.width_, dim_b.height_);
CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
......@@ -761,8 +761,8 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
beta,
mat_out->template mutable_data<T>());
} else {
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
CHECK(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
dim_b.batch_size_ == 0);
this->template BatchedGEMM<T>(
transA,
transB,
......
......@@ -146,7 +146,7 @@ class ContextProjectFunctor {
}
}
if (padding_trainable) {
PADDLE_ENFORCE(padding_data != nullptr);
CHECK(padding_data != nullptr);
for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
if (lod_level_0[i] == lod_level_0[i + 1]) continue;
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <functional>
#include <string>
#include "lite/backends/x86/cpu_info.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
#ifdef PADDLE_WITH_MKLML
#include "lite/backends/x86/mklml.h"
......@@ -652,7 +652,7 @@ class VecActivations {
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
PADDLE_THROW("Not support type: %s", type);
LOG(FATAL) << "Not support type: " << type;
}
};
......
......@@ -57,7 +57,7 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> {
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < num_remain; j++) {
int lbl = label_data[i * num_remain + j];
PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index);
CHECK((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index);
int index = i * num_classes + lbl * num_remain + j;
int loss_idx = i * num_remain + j;
loss_data[loss_idx] =
......
......@@ -27,7 +27,7 @@ namespace math {
template <typename T>
struct TolerableValue {
HOSTDEVICE T operator()(const T& x) const {
PADDLE_ENFORCE(static_cast<bool>(std::is_floating_point<T>::value));
CHECK(static_cast<bool>(std::is_floating_point<T>::value));
const T kApproInf = 1e20;
if (x == INFINITY) return kApproInf;
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <math.h>
#include <string>
#include "lite/backends/x86/cpu_info.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -46,8 +46,6 @@ inline ActivationType GetActivationType(const std::string &type) {
return ActivationType::kIdentity;
}
LOG(ERROR) << "Not support type " << type;
// PADDLE_ENFORCE(false, "Not support type %s", type);
// PADDLE_THROW("Not support type %s.", type);
return ActivationType();
}
......
......@@ -13,7 +13,7 @@ limitations under the License. */
#include "lite/backends/x86/math/detail/activation_functions.h"
#include "lite/core/context.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "lite/backends/x86/math/im2col.h"
#include <vector>
#include "lite/backends/x86/math/im2col_cfo_cpu.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -38,8 +38,8 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kCFO,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col->dims().size() == 5);
CHECK_EQ(im.dims().size(), 3);
CHECK_EQ(col->dims().size(), 5);
if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
dilation[1] == 1) {
......@@ -72,8 +72,8 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
CHECK_EQ(im->dims().size(), 3);
CHECK_EQ(col.dims().size(), 5);
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
int im_width = im->dims()[2];
......@@ -82,20 +82,20 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
int col_height = col.dims()[3];
int col_width = col.dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
CHECK_EQ((im_height + padding[0] + padding[2] -
((dilation[0] * (filter_height - 1) + 1))) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
col_height)
<< "Output_height and padding(padding_up, padding_down) are "
"inconsistent.";
CHECK_EQ((im_width + padding[1] + padding[3] -
((dilation[1] * (filter_width - 1) + 1))) /
stride[1] +
1,
col_width,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
col_width)
<< "Output_height and padding(padding_up, padding_down) are "
"inconsistent.";
int channels_col = im_channels * filter_height * filter_width;
......@@ -150,8 +150,8 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col->dims().size() == 5);
CHECK_EQ(im.dims().size(), 3);
CHECK_EQ(col->dims().size(), 5);
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
......@@ -214,8 +214,8 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
CHECK_EQ(im->dims().size(), 3);
CHECK_EQ(col.dims().size(), 5);
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
int im_width = im->dims()[2];
......@@ -224,16 +224,16 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
int col_height = col.dims()[0];
int col_width = col.dims()[1];
PADDLE_ENFORCE_EQ(
CHECK_EQ(
(im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
col_height)
<< "Output_height and padding(padding_up, padding_down) are "
"inconsistent.";
CHECK_EQ(
(im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
col_width)
<< "col_width and padding(padding_left, padding_right) are "
"inconsistent.";
T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>();
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "lite/backends/x86/math/detail/activation_functions.h"
#include "lite/core/context.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -121,8 +121,8 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
lite::Tensor* output) {
const auto& in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
CHECK_EQ(vector.numel(), size);
CHECK_EQ(output->dims(), in_dims);
const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>();
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "lite/fluid/float16.h"
#include "lite/utils/paddle_enforce.h"
//#include "lite/tensor_util.h"
#include "lite/utils/cp_logging.h"
// #include "lite/tensor_util.h"
namespace paddle {
namespace lite {
......
......@@ -59,7 +59,7 @@ void ColwiseSum<Target, T>::operator()(const lite::Context<Target>& context,
lite::TensorLite* out) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(), size);
CHECK_EQ(out->numel(), size);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(*out);
......@@ -81,7 +81,7 @@ class ColwiseSum<lite::TargetType::kX86, T> {
auto& in_dims = input.dims();
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
CHECK_EQ(out->numel(), size);
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
......@@ -103,8 +103,8 @@ void RowwiseMean<Target, T>::operator()(const lite::Context<Target>& context,
const lite::TensorLite& input,
lite::TensorLite* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
CHECK_EQ(in_dims.size(), 2U);
CHECK_EQ(out->numel(), in_dims[0]);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(*out);
......@@ -124,10 +124,10 @@ class RowwiseMean<lite::TargetType::kX86, T> {
const lite::TensorLite& input,
lite::TensorLite* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
CHECK_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
CHECK_EQ(out->numel(), height);
auto inv_size = 1.0 / size;
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
......@@ -147,8 +147,8 @@ void RowwiseSum<Target, T>::operator()(const lite::Context<Target>& context,
const lite::TensorLite& input,
lite::TensorLite* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
CHECK_EQ(in_dims.size(), 2U);
CHECK_EQ(out->numel(), in_dims[0]);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(*out);
......@@ -168,10 +168,10 @@ class RowwiseSum<lite::TargetType::kX86, T> {
const lite::TensorLite& input,
lite::TensorLite* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
CHECK_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
CHECK_EQ(out->numel(), height);
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
......
......@@ -273,7 +273,7 @@ TEST(math_funciton, set_constant) {
auto* ctx = new paddle::platform::CPUDeviceContext();
paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
CHECK_EQ(10, t.data<int>()[i]);
}
delete ctx;
}
......
......@@ -32,7 +32,7 @@ namespace math {
class Sampler {
public:
explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
// PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0.");
// CHECK_GT(range, 0, "Range should be greater than 0.");
if (seed == 0) {
std::random_device r;
seed_ = r();
......
......@@ -31,7 +31,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
const fluid::SelectedRows& input2,
fluid::SelectedRows* output) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height());
CHECK_EQ(in1_height, input2.height());
output->set_height(in1_height);
auto& in1_rows = input1.rows();
......@@ -49,8 +49,8 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
CHECK_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
CHECK_EQ(in1_row_numel, out_value->numel() / out_rows.size());
auto* out_data = out_value->template mutable_data<T>();
auto* in1_data = in1_value.data<T>();
......@@ -73,15 +73,15 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
CHECK_EQ(in1_height, in2_dims[0]);
CHECK_EQ(in1_height, out_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
CHECK_EQ(in1_row_numel, input2.numel() / in1_height);
CHECK_EQ(in1_row_numel, output->numel() / in1_height);
SetConstant<lite::TargetType::kX86, T> functor;
functor(context, output, 0.0);
......@@ -113,7 +113,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
const int64_t input2_offset,
fluid::SelectedRows* input2) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
CHECK_EQ(in1_height, input2->height());
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
......@@ -149,7 +149,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
auto& in_rows = (*iter)->rows();
size += in_rows.end() - in_rows.begin();
auto in1_height = (*iter)->height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
CHECK_EQ(in1_height, input2->height());
}
// concat rows
std::vector<int64_t> in2_rows;
......@@ -185,13 +185,13 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
CHECK_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
CHECK_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->template mutable_data<T>();
......@@ -291,12 +291,11 @@ struct MergeAdd<lite::TargetType::kX86, T> {
if (input->rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(input_width,
input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
PADDLE_ENFORCE_EQ(
input_height, input->height(), "all input should have same height");
CHECK_EQ(input_width, input->value().dims()[1])
<< "all input should have same "
"dimension except for the first one";
CHECK_EQ(input_height, input->height())
<< "all input should have same height";
row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
......@@ -376,13 +375,13 @@ struct UpdateToTensor<lite::TargetType::kX86, T> {
lite::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
CHECK_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
CHECK_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->template data<T>();
......
......@@ -30,12 +30,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
const uint64_t* index = index_lod.data();
const auto& src_dims = src.dims();
const auto& dst_dims = dst->dims();
PADDLE_ENFORCE_EQ(
src_dims.size(), 2UL, "The src must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(
dst_dims.size(), 2UL, "The dst must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(
src_dims[1], dst_dims[1], "The width of src and dst must be same.");
CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2.";
CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2.";
CHECK_EQ(src_dims[1], dst_dims[1])
<< "The width of src and dst must be same.";
auto height = dst_dims[0];
auto width = dst_dims[1];
auto* src_data = src.data<T>();
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/eigen.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -66,21 +66,18 @@ class LoDTensor2BatchFunctor {
bool is_reverse = false) const {
if (!is_cal_batch_lod) {
auto lods = batch->lod();
PADDLE_ENFORCE_GT(lods.size(),
2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
lods[1].size(),
static_cast<size_t>(lod_tensor.dims()[0]),
"The LoD information should be consistent with the dims.");
CHECK_GT(lods.size(), 2UL)
<< "The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.";
CHECK_EQ(lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]))
<< "The LoD information should be consistent with the dims.";
CopyMatrixRowsFunctor<Target, T> to_batch;
to_batch(context, lod_tensor, lods[1], batch, true);
return;
}
auto lods = lod_tensor.lod();
PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now.";
const auto& lod = lods[0];
......@@ -165,14 +162,11 @@ class Batch2LoDTensorFunctor {
const lite::Tensor& batch,
lite::Tensor* lod_tensor) const {
auto in_lod = batch.lod();
PADDLE_ENFORCE_GT(in_lod.size(),
2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
in_lod[1].size(),
static_cast<size_t>(lod_tensor->dims()[0]),
"The LoD information should be consistent with the dims.");
CHECK_GT(in_lod.size(), 2UL)
<< "The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.";
CHECK_EQ(in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]))
<< "The LoD information should be consistent with the dims.";
CopyMatrixRowsFunctor<Target, T> to_seq;
to_seq(context, batch, in_lod[1], lod_tensor, false);
}
......
......@@ -37,10 +37,9 @@ void CopyValidData(lite::Tensor* dst_tensor,
layout == kBatchLengthWidth ? step_width : seq_num * step_width;
for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) {
int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
PADDLE_ENFORCE_GE(
pad_seq_len,
valid_seq_len,
"The padded sequence length can not be less than its original length.");
CHECK_GE(pad_seq_len, valid_seq_len) << "The padded sequence length can "
"not be less than its original "
"length.";
int seq_data_offset = seq_offsets[seq_idx] * step_width;
int pad_data_offset = layout == kBatchLengthWidth
? seq_idx * pad_seq_len * step_width
......@@ -108,9 +107,9 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> {
pad_seq_len,
step_width,
layout);
PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
"The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width'.");
CHECK(pad_value.numel() == 1 || pad_value.numel() == step_width)
<< "The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width'.";
// fill padding value
T* pad_data = pad_tensor->template mutable_data<T>();
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/lod.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -46,15 +46,14 @@ inline static void CheckDims(const lite::DDim& seq_tensor_dims,
int64_t padded_seq_len,
int64_t step_width,
const PadLayout& layout) {
PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]),
seq_offset.back(),
"Value of 1st dimension of the sequence tensor should be "
"equal to sum of lengths of all sequences.");
CHECK_EQ(static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back())
<< "Value of 1st dimension of the sequence tensor should be "
"equal to sum of lengths of all sequences.";
PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
seq_tensor_dims.size() == pad_tensor_dims.size(),
"pad_tensor's rank should be 1 greater than seq_tensor's "
"rank, or be equal with it.");
CHECK(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
seq_tensor_dims.size() == pad_tensor_dims.size())
<< "pad_tensor's rank should be 1 greater than seq_tensor's "
"rank, or be equal with it.";
}
/*
......
......@@ -46,12 +46,12 @@ class MaxSeqPoolFunctor {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1u);
CHECK_GT(in_dims.size(), 1u);
CHECK_GT(out_dims.size(), 1u);
for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
CHECK_EQ(in_dims[i], out_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, out_dims);
CHECK_EQ(idx_dims, out_dims);
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
......@@ -95,10 +95,10 @@ class MaxSeqPoolFunctor<T, true> {
lite::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1u);
CHECK_GT(in_dims.size(), 1u);
CHECK_GT(out_dims.size(), 1u);
for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
CHECK_EQ(in_dims[i], out_dims[i]);
}
auto starts = input.lod()[0];
......@@ -136,12 +136,12 @@ class MaxSeqPoolGradFunctor {
auto og_dims = out_grad.dims();
auto ig_dims = in_grad->dims();
auto idx_dims = index.dims();
PADDLE_ENFORCE_GT(og_dims.size(), 1);
PADDLE_ENFORCE_GT(ig_dims.size(), 1);
CHECK_GT(og_dims.size(), 1);
CHECK_GT(ig_dims.size(), 1);
for (size_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
CHECK_EQ(og_dims[i], ig_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, og_dims);
CHECK_EQ(idx_dims, og_dims);
const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>();
......@@ -236,7 +236,7 @@ class SumSeqPoolGradFunctor {
auto lod = in_grad->lod()[0];
int64_t out_w = out_grad.numel() / out_grad.dims()[0];
int64_t in_w = in_grad->numel() / in_grad->dims()[0];
PADDLE_ENFORCE(in_w == out_w);
CHECK(in_w == out_w);
const T* out_g_data = out_grad.data<T>();
T* in_g_data = in_grad->template mutable_data<T>(TARGET(kX86));
auto blas = math::GetBlas<TARGET(kX86), T>(context);
......@@ -330,7 +330,7 @@ class SequencePoolFunctor<TARGET(kX86), T> {
out_e.device(eigen_device) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h));
} else {
PADDLE_THROW("unsupported pooling pooltype");
LOG(FATAL) << "unsupported pooling pooltype";
}
}
}
......@@ -389,7 +389,7 @@ class SequencePoolGradFunctor<TARGET(kX86), T> {
} else if (pooltype == "FIRST") {
in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v;
} else {
PADDLE_THROW("unsupported pooling pooltype");
LOG(FATAL) << "unsupported pooling pooltype";
}
}
}
......