mereged with upstream/develope

e5f6ce88 · chonwhite · 0c1bde02 · e5f6ce88 · e5f6ce88 · e5f6ce88
1000 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -63,6 +63,16 @@ test/models/

 test/images/

+*.pyc
+
+# model
+*.nb
+*.svg
+*.dot
+
+# vim intermediate files
+*.swp
+
 # Emacs intermediate files
 *~

@@ -105,3 +115,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
 metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
 metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
 metal/MobileNetDemo/MobileNetDemo/Resources
+
+build*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 message(STATUS "AR tools: ${CMAKE_AR}")

+
+if(WIN32)
+    option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
+
+    set(CMAKE_SUPPRESS_REGENERATION ON)
+    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
+    add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+
+    if (MSVC_STATIC_CRT)
+      set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+      set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+      set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+      set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    endif()
+
+    add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
+    add_compile_options(/MP)
+    message(STATUS "Using parallel compiling (/MP)")
+    set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
+    set(CMAKE_STATIC_LINKER_FLAGS  "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+
+endif()
+
 if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    find_package(CUDA QUIET)
 endif()
@@ -62,15 +87,20 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
 lite_option(LITE_WITH_X86  "Enable X86 in lite mode"  ON)
 lite_option(LITE_WITH_ARM  "Enable ARM in lite mode"  OFF)
 lite_option(LITE_WITH_NPU  "Enable NPU in lite mode"  OFF)
+lite_option(LITE_WITH_RKNPU  "Enable RKNPU in lite mode"  OFF)
+lite_option(LITE_WITH_MLU  "Enable MLU in lite mode"  OFF)
 lite_option(LITE_WITH_XPU  "Enable XPU in lite mode"  OFF)
+lite_option(LITE_WITH_XTCL  "Enable XPU via XTCL"  OFF IF LITE_WITH_XPU)
 lite_option(LITE_WITH_BM   "Enable BM in lite mode"   OFF)
+lite_option(LITE_WITH_APU  "Enable APU in lite mode"  OFF)
+lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
 lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
 lite_option(LITE_WITH_OPENCL   "Enable OpenCL support in lite" OFF)
 lite_option(LITE_WITH_FPGA   "Enable FPGA support in lite" OFF)
 lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK  "Enable light-weight framework" OFF)
 lite_option(LITE_WITH_PROFILE  "Enable profile mode in lite framework"  OFF)
-lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE)
-lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
+lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
+lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
 lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
 lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
 # publish options
@@ -79,6 +109,7 @@ lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
 # cv build options
 lite_option(LITE_WITH_CV  "Enable build cv image in lite" OFF)
 lite_option(LITE_WITH_STATIC_CUDA  "Statically link cuda libraries." ON)
+lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF)

 # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
 if(ANDROID OR IOS OR ARMLINUX)
@@ -104,9 +135,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
+    if(WIN32)
+        set(CMAKE_BUILD_TYPE "Release" CACHE STRING
+        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+        FORCE)
+    else()
+    
    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
            "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
            FORCE)
+    endif()
 endif()
 message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")

@@ -128,12 +166,18 @@ if (LITE_WITH_PYTHON)
    include(external/pybind11)    # download, build, install pybind11
 endif()

+if(LITE_WITH_RKNPU)
+   include(device/rknpu)
+endif()
+

 # for mobile
 if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    message(STATUS "Building the mobile framework")
    include(cross_compiling/postproject)
-    include(cross_compiling/npu) # check and prepare NPU DDK
+    include(device/npu) # check and prepare NPU DDK
+    include(device/xpu) # check and prepare XPU SDK
+    include(device/apu) # check and prepare APU SDK

    # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
    # So the following third party dependencies are not needed.
@@ -174,11 +218,17 @@ endif()
 ########################################################################################

 if(LITE_WITH_XPU)
-    include(xpu)
+    include(device/xpu)
 endif()

+if(LITE_WITH_MLU)
+    include(mlu)
+endif()
+include(coveralls)
+
 include(external/mklml)     # download mklml package
 include(external/xbyak)     # download xbyak package
+
 include(external/libxsmm)   # download, build, install libxsmm
 include(external/gflags)    # download, build, install gflags
 include(external/glog)      # download, build, install glog
@@ -203,7 +253,9 @@ include(generic)            # simplify cmake module
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(version)            # set PADDLE_VERSION
-include(flags)
+if(NOT APPLE)
+  include(flags)
+endif()

 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

--- a/README.md
+++ b/README.md
@@ -3,14 +3,14 @@
 # Paddle Lite

 <!--[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle-Lite.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/Paddle-Lite)-->
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddle-lite.readthedocs.io/zh/latest/)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 <!-- [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle-Mobile.svg)](https://github.com/PaddlePaddle/Paddle-Mobile/releases) -->


 Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embeded, and IoT devices. It is compatible with PaddlePaddle and pre-trained models from other sources.

-For tutorials, please see [PaddleLite Document](https://paddlepaddle.github.io/Paddle-Lite/).
+For tutorials, please see [PaddleLite Document](https://paddle-lite.readthedocs.io/zh/latest/).

 ## Key Features

@@ -61,7 +61,8 @@ For demands of Apple's GPU Metal and web front end inference, please see `./meta
 Paddle Lite has referenced the following open-source projects:

 - [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29)
- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin has been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite. 
+- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin has been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite.  
+


 ## Feedback and Community Support

--- a/README_cn.md
+++ b/README_cn.md
 #  Paddle Lite

 <!--[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle-Lite.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/Paddle-Lite)-->
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddle-lite.readthedocs.io/zh/latest/)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 <!-- [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle-Mobile.svg)](https://github.com/PaddlePaddle/Paddle-Mobile/releases) -->

 Paddle Lite为Paddle-Mobile的升级版，定位支持包括手机移动端在内更多场景的轻量化高效预测，支持更广泛的硬件和平台，是一个高性能、轻量级的深度学习预测引擎。在保持和PaddlePaddle无缝对接外，也兼容支持其他训练框架产出的模型。

-完整使用文档位于 [PaddleLite 文档](https://paddlepaddle.github.io/Paddle-Lite/) 。
+完整使用文档位于 [PaddleLite 文档](https://paddle-lite.readthedocs.io/zh/latest/) 。

 ## 特性


--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -34,6 +34,15 @@ elseif(SSE3_FOUND)
    set(SIMD_FLAG ${SSE3_FLAG})
 endif()

+if(WIN32)
+  # windows header option for all targets.
+  add_definitions(-D_XKEYCHECK_H)
+  
+  if (NOT MSVC)
+    message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
+  endif(NOT MSVC)
+endif(WIN32)
+
 if(LITE_WITH_CUDA)
    add_definitions(-DLITE_WITH_CUDA)
    add_definitions(-DEIGEN_USE_GPU)
@@ -70,7 +79,7 @@ endif()

 if (WITH_MKLML AND MKLML_IOMP_LIB)
    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
-    if(WIN32)
+    if(WIN32 OR APPLE)
        # openmp not support well for now on windows
        set(OPENMP_FLAGS "")
    else(WIN32)
@@ -122,6 +131,9 @@ if (LITE_WITH_ARM)
    endif()
 endif()

+if (LITE_WITH_TRAIN)
+    add_definitions("-DLITE_WITH_TRAIN")
+endif()

 if (WITH_ARM_DOTPROD)
    add_definitions("-DWITH_ARM_DOTPROD")
@@ -131,8 +143,19 @@ if (LITE_WITH_NPU)
    add_definitions("-DLITE_WITH_NPU")
 endif()

+if (LITE_WITH_APU)
+    add_definitions("-DLITE_WITH_APU")
+endif()
+
+if (LITE_WITH_RKNPU)
+    add_definitions("-DLITE_WITH_RKNPU")
+endif()
+
 if (LITE_WITH_XPU)
    add_definitions("-DLITE_WITH_XPU")
+    if (LITE_WITH_XTCL)
+      add_definitions("-DLITE_WITH_XTCL")
+    endif()
 endif()

 if (LITE_WITH_OPENCL)
@@ -147,19 +170,24 @@ if (LITE_WITH_BM)
 add_definitions("-DLITE_WITH_BM")
 endif()

+if (LITE_WITH_MLU)
+add_definitions("-DLITE_WITH_MLU")
+endif()
+
 if (LITE_WITH_PROFILE)
    add_definitions("-DLITE_WITH_PROFILE")
-    if (LITE_WITH_PRECISION_PROFILE)
-        add_definitions("-DLITE_WITH_PRECISION_PROFILE")
-    endif()
+endif()
+
+if (LITE_WITH_PRECISION_PROFILE)
+    add_definitions("-DLITE_WITH_PRECISION_PROFILE")
 endif()

 if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
  add_definitions("-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK")
 endif()

-if (LITE_SHUTDOWN_LOG)
-  add_definitions("-DLITE_SHUTDOWN_LOG")
+if (LITE_WITH_LOG)
+  add_definitions("-DLITE_WITH_LOG")
 endif()

 if (LITE_ON_TINY_PUBLISH)
@@ -170,3 +198,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL)
  add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL")
 endif(LITE_ON_MODEL_OPTIMIZE_TOOL)

+if (LITE_WITH_PYTHON)
+  add_definitions("-DLITE_WITH_PYTHON")
+endif(LITE_WITH_PYTHON)
--- a/cmake/coveralls.cmake
+++ b/cmake/coveralls.cmake
@@ -20,6 +20,9 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
    # will be converted from the format "1;2;3" to "1 2 3".
    set(COVERAGE_SRCS "")
    foreach (SINGLE_SRC ${_COVERAGE_SRCS})
+        if ("${SINGLE_SRC}" MATCHES "/Paddle-Lite/third-party/*")
+            continue()
+        endif()
        set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
    endforeach()

@@ -62,7 +65,7 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
 endfunction()

 if(WITH_COVERAGE)
-    set(CMAKE_BUILD_TYPE "Debug")
+    #set(CMAKE_BUILD_TYPE "Debug")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")

@@ -95,9 +98,11 @@ if(WITH_COVERAGE)
        set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
    endforeach()

+    set(COVERALLS_UPLOAD ON)
    code_coverage(
        "${PADDLE_SRCS}"
        ${COVERALLS_UPLOAD}
        "${PROJECT_SOURCE_DIR}/cmake"
    )
 endif()
+
--- a/cmake/cross_compiling/findar.cmake
+++ b/cmake/cross_compiling/findar.cmake
@@ -23,7 +23,7 @@ endif()

 get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH)

-find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH})
+find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH} NO_DEFAULT_PATH)

 if(NOT AR_TOOL)
    message(ERROR "Failed to find AR_TOOL in ${AR_PATH}")

--- a/cmake/cross_compiling/postproject.cmake
+++ b/cmake/cross_compiling/postproject.cmake
@@ -57,10 +57,14 @@ function(check_linker_flag)
    endforeach()
    set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE)
 endfunction()
+
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 if (LITE_ON_TINY_PUBLISH)
-    if(NOT LITE_WITH_PYTHON)
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
+    if((NOT LITE_WITH_PYTHON))
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
+    endif()
+    if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
    endif()
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")

--- a/cmake/device/apu.cmake
+++ b/cmake/device/apu.cmake
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT LITE_WITH_APU)
+  return()
+endif()
+
+if(NOT DEFINED APU_DDK_ROOT)
+    set(APU_DDK_ROOT $ENV{APU_DDK_ROOT})
+    if(NOT APU_DDK_ROOT)
+        message(FATAL_ERROR "Must set APU_DDK_ROOT or env APU_DDK_ROOT when LITE_WITH_APU=ON")
+    endif()
+endif()
+
+message(STATUS "APU_DDK_ROOT: ${APU_DDK_ROOT}")
+find_path(APU_DDK_INC NAMES NeuronAdapter.h
+  PATHS ${APU_DDK_ROOT}/include NO_DEFAULT_PATH)
+if(NOT APU_DDK_INC)
+  message(FATAL_ERROR "Can not find NeuronAdapter.h in ${APU_DDK_ROOT}/include")
+endif()
+message(STATUS "APU_DDK_INC: ${APU_DDK_INC}")
+
+include_directories("${APU_DDK_ROOT}/include")
--- a/cmake/cross_compiling/npu.cmake
+++ b/cmake/cross_compiling/npu.cmake
@@ -17,15 +17,16 @@ if(NOT LITE_WITH_NPU)
 endif()

 if(NOT DEFINED NPU_DDK_ROOT)
-    set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT})
-    if(NOT NPU_DDK_ROOT)
-        message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON")
-    endif()
+  set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT})
+  if(NOT NPU_DDK_ROOT)
+    message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON")
+  endif()
 endif()

 message(STATUS "NPU_DDK_ROOT: ${NPU_DDK_ROOT}")
 find_path(NPU_DDK_INC NAMES HiAiModelManagerService.h
-  PATHS ${NPU_DDK_ROOT}/include NO_DEFAULT_PATH)
+  PATHS ${NPU_DDK_ROOT}/include
+  NO_DEFAULT_PATH)
 if(NOT NPU_DDK_INC)
  message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include")
 endif()
@@ -34,21 +35,24 @@ include_directories("${NPU_DDK_ROOT}/include")

 set(NPU_SUB_LIB_PATH "lib64")
 if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
-    set(NPU_SUB_LIB_PATH "lib64")
+  set(NPU_SUB_LIB_PATH "lib64")
 endif()

 if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
-    set(NPU_SUB_LIB_PATH "lib")
+  set(NPU_SUB_LIB_PATH "lib")
 endif()

 find_library(NPU_DDK_HIAI_FILE NAMES hiai
-  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
+  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
+  NO_DEFAULT_PATH)

 find_library(NPU_DDK_IR_FILE NAMES hiai_ir
-  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
+  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
+  NO_DEFAULT_PATH)

 find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
-  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
+  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
+  NO_DEFAULT_PATH)

 if(NOT NPU_DDK_HIAI_FILE)
  message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
@@ -76,6 +80,3 @@ endif()

 set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
 set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
-
-
-
--- a/cmake/device/rknpu.cmake
+++ b/cmake/device/rknpu.cmake
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT LITE_WITH_RKNPU)
+  return()
+endif()
+
+if(NOT DEFINED RKNPU_DDK_ROOT)
+    set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT})
+    if(NOT RKNPU_DDK_ROOT)
+        message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON")
+    endif()
+endif()
+
+message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}")
+find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h
+  PATHS ${RKNPU_DDK_ROOT}/include/  NO_DEFAULT_PATH)
+if(NOT RKNPU_DDK_INC)
+  message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include")
+endif()
+
+include_directories("${RKNPU_DDK_ROOT}/include")
+
+set(RKNPU_SUB_LIB_PATH "lib64")
+if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
+    set(RKNPU_SUB_LIB_PATH "lib64")
+endif()
+
+if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
+    set(RKNPU_SUB_LIB_PATH "lib")
+endif()
+
+find_library(RKNPU_DDK_FILE NAMES rknpu_ddk
+  PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH})
+
+if(NOT RKNPU_DDK_FILE)
+  message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}")
+else()
+  message(STATUS "Found RKNPU_DDK_FILE  Library: ${RKNPU_DDK_FILE}")
+  add_library(rknpu_ddk  SHARED IMPORTED GLOBAL)
+  set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE})
+endif()
+
+set(rknpu_runtime_libs rknpu_ddk  CACHE INTERNAL "rknpu ddk runtime libs")
--- a/cmake/xpu.cmake
+++ b/cmake/xpu.cmake
@@ -17,46 +17,18 @@ if(NOT LITE_WITH_XPU)
 endif()

 if(NOT DEFINED XPU_SDK_ROOT)
-    set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT})
-    if(NOT XPU_SDK_ROOT)
-        message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
-    endif()
+  set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT})
+  if(NOT XPU_SDK_ROOT)
+    message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
+  endif()
 endif()
-
 message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
-find_path(XPU_SDK_INC NAMES xtcl.h
-  PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
-if(NOT XPU_SDK_INC)
-  message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
-endif()

-include_directories("${XPU_SDK_ROOT}/XTCL/include")
 include_directories("${XPU_SDK_ROOT}/XTDK/include")

-find_library(XPU_SDK_XTCL_FILE NAMES xtcl
-  PATHS ${XPU_SDK_ROOT}/XTCL/so)
-
-if(NOT XPU_SDK_XTCL_FILE)
-  message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
-  add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
-endif()
-
-find_library(XPU_SDK_TVM_FILE NAMES tvm
-  PATHS ${XPU_SDK_ROOT}/XTCL/so)
-
-if(NOT XPU_SDK_TVM_FILE)
-  message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
-  add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
-endif()
-
 find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
+  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
+  NO_DEFAULT_PATH)

 if(NOT XPU_SDK_XPU_API_FILE)
  message(FATAL_ERROR "Can not find XPU API Library in ${XPU_SDK_ROOT}")
@@ -67,7 +39,8 @@ else()
 endif()

 find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
+  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
+  NO_DEFAULT_PATH)

 if(NOT XPU_SDK_XPU_RT_FILE)
  message(FATAL_ERROR "Can not find XPU RT Library in ${XPU_SDK_ROOT}")
@@ -77,29 +50,55 @@ else()
  set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
 endif()

-find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
+set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs")
+set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs")

-if(NOT XPU_SDK_XPU_JITC_FILE)
-  message(FATAL_ERROR "Can not find XPU JITC Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU JITC Library: ${XPU_SDK_XPU_JITC_FILE}")
-  add_library(xpu_sdk_xpu_jitc SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_xpu_jitc PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_JITC_FILE})
-endif()
+if(LITE_WITH_XTCL)
+    find_path(XPU_SDK_INC NAMES xtcl.h
+      PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
+    if(NOT XPU_SDK_INC)
+      message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
+    endif()
+    include_directories("${XPU_SDK_ROOT}/XTCL/include")
+
+    find_library(XPU_SDK_XTCL_FILE NAMES xtcl
+      PATHS ${XPU_SDK_ROOT}/XTCL/so
+      NO_DEFAULT_PATH)
+
+    if(NOT XPU_SDK_XTCL_FILE)
+      message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
+    else()
+      message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
+      add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
+      set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
+    endif()

-find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
+    find_library(XPU_SDK_TVM_FILE NAMES tvm
+      PATHS ${XPU_SDK_ROOT}/XTCL/so
+      NO_DEFAULT_PATH)

-if(NOT XPU_SDK_LLVM_FILE)
-  message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
-  add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
-endif()
+    if(NOT XPU_SDK_TVM_FILE)
+      message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
+    else()
+      message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
+      add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
+      set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
+    endif()
+
+    find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
+      PATHS ${XPU_SDK_ROOT}/XTDK/shlib
+      NO_DEFAULT_PATH)

-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1 -D_GLIBCXX_USE_CXX11_ABI=0")
+    if(NOT XPU_SDK_LLVM_FILE)
+      message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
+    else()
+      message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
+      add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
+      set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
+    endif()

-set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
-set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
+
+    set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
+    set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
+endif()
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -36,7 +36,16 @@ else()
        # eigen on cuda9.1 missing header of math_funtions.hpp
        # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
        GIT_TAG
-        URL             http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
+        ######################################################################################################
+        # url address of eigen before v2.3.0
+        # URL             http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
+        ######################################################################################################
+        # url address of eigen since  v2.6.0
+        #         github address: https://github.com/eigenteam/eigen-git-mirror
+        # we changed the source code to adapt for windows compiling
+        #         git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
+        ######################################################################################################
+        URL             http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
        DOWNLOAD_DIR          ${EIGEN_SOURCECODE_DIR}
        DOWNLOAD_NO_PROGRESS  1
        PREFIX          ${EIGEN_SOURCE_DIR}

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML})
  return()
 ENDIF(NOT ${WITH_MKLML})

-IF(APPLE)
-    MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
-    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
-    return()
-ENDIF()
-
 INCLUDE(ExternalProject)
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
@@ -38,7 +32,17 @@ IF(WIN32)
    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
+    SET(MKLML_SHARED_LIB_DEPS     ${MKLML_LIB_DIR}/msvcr120.dll)
    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5md.dll)
+ELSEIF(APPLE)
+    #TODO(intel-huying):
+    #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
+    SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE)
+    SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml.dylib)
+    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.dylib)
+    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml.dylib)
+    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5.dylib)
 ELSE()
    #TODO(intel-huying):
    #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.

--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
 SET(py_env "")
 IF(PYTHONINTERP_FOUND)
    find_python_module(pip REQUIRED)
-    find_python_module(numpy REQUIRED)
+    #find_python_module(numpy REQUIRED)
    #find_python_module(wheel REQUIRED)
    #find_python_module(google.protobuf REQUIRED)
-    FIND_PACKAGE(NumPy REQUIRED)
+    #FIND_PACKAGE(NumPy REQUIRED)
    #IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
    #    MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
    #    "please use pip to upgrade protobuf. pip install -U protobuf")

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -276,7 +276,7 @@ function(cc_library TARGET_NAME)
        add_dependencies(${TARGET_NAME} mklml)
        if(WIN32)
          target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
-        else(WIN32)
+        elseif(NOT APPLE)
          target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
        endif(WIN32)
      endif()

--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  set(deps ${lite_deps_DEPS})
@@ -88,6 +88,18 @@ function (lite_deps TARGET)
    endforeach(var)
  endif()

+  if (LITE_WITH_APU)
+    foreach(var ${lite_deps_APU_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
+
+  if (LITE_WITH_RKNPU)
+    foreach(var ${lite_deps_RKNPU_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
+
  if (LITE_WITH_XPU)
    foreach(var ${lite_deps_XPU_DEPS})
      set(deps ${deps} ${var})
@@ -100,6 +112,12 @@ function (lite_deps TARGET)
    endforeach(var)
  endif()

+  if (LITE_WITH_MLU)
+    foreach(var ${lite_deps_MLU_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
+
  set(${TARGET} ${deps} PARENT_SCOPE)
 endfunction()

@@ -125,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
    set(options SHARED shared STATIC static MODULE module)
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
      HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -136,14 +154,17 @@ function(lite_cc_library TARGET)
            CUDA_DEPS ${args_CUDA_DEPS}
            CL_DEPS ${args_CL_DEPS}
            BM_DEPS ${args_BM_DEPS}
+            RKNPU_DEPS ${args_RKNPU_DEPS}
            ARM_DEPS ${args_ARM_DEPS}
            CV_DEPS ${args_CV_DEPS}
            FPGA_DEPS ${args_FPGA_DEPS}
            NPU_DEPS ${args_NPU_DEPS}
+            APU_DEPS ${args_APU_DEPS}
            XPU_DEPS ${args_XPU_DEPS}
            PROFILE_DEPS ${args_PROFILE_DEPS}
            LIGHT_DEPS ${args_LIGHT_DEPS}
            HVY_DEPS ${args_HVY_DEPS}
+            MLU_DEPS ${args_MLU_DEPS}
            )

    if (args_SHARED OR ARGS_shared)
@@ -154,8 +175,10 @@ function(lite_cc_library TARGET)
    else()
        cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
    endif()
-    target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)

+    if(NOT WIN32)
+      target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
+    endif()
    # collect targets need to compile for lite
    if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS)
        add_dependencies(lite_compile_deps ${TARGET})
@@ -170,7 +193,7 @@ function(lite_cc_binary TARGET)
        set(options " -g ")
    endif()
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -183,15 +206,20 @@ function(lite_cc_binary TARGET)
            ARM_DEPS ${args_ARM_DEPS}
            FPGA_DEPS ${args_FPGA_DEPS}
            NPU_DEPS ${args_NPU_DEPS}
+            APU_DEPS ${args_APU_DEPS}
            XPU_DEPS ${args_XPU_DEPS}
-	    BM_DEPS ${args_BM_DEPS}
+            RKNPU_DEPS ${args_RKNPU_DEPS}
+            BM_DEPS ${args_BM_DEPS}
            PROFILE_DEPS ${args_PROFILE_DEPS}
            LIGHT_DEPS ${args_LIGHT_DEPS}
            HVY_DEPS ${args_HVY_DEPS}
            CV_DEPS ${CV_DEPS}
+            MLU_DEPS ${args_MLU_DEPS}
            )
    cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
-    target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
+    if(NOT WIN32)
+      target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
+    endif()
    if (NOT APPLE)
        # strip binary target to reduce size
        if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
@@ -218,7 +246,7 @@ function(lite_cc_test TARGET)
    endif()
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
        LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
        ARGS
        COMPILE_LEVEL # (basic|extra)
@@ -239,12 +267,15 @@ function(lite_cc_test TARGET)
              ARM_DEPS ${args_ARM_DEPS}
              FPGA_DEPS ${args_FPGA_DEPS}
              NPU_DEPS ${args_NPU_DEPS}
+              APU_DEPS ${args_APU_DEPS}
              XPU_DEPS ${args_XPU_DEPS}
-	      BM_DEPS ${args_BM_DEPS}
+              RKNPU_DEPS ${args_RKNPU_DEPS}
+              BM_DEPS ${args_BM_DEPS}
              PROFILE_DEPS ${args_PROFILE_DEPS}
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}
              CV_DEPS ${args_CV_DEPS}
+              MLU_DEPS ${args_MLU_DEPS}
              )
    _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
    # strip binary target to reduce size
@@ -254,7 +285,9 @@ function(lite_cc_test TARGET)
                "${TARGET}"
                COMMENT "Strip debug symbols done on final executable file.")
    endif()
-    target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
+    if(NOT WIN32)
+      target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
+    endif()
    file(APPEND ${offline_test_registry_file} "${TARGET}\n")

    # collect targets need to compile for lite
@@ -268,24 +301,32 @@ set(x86_kernels CACHE INTERNAL "x86 kernels")
 set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
+set(apu_kernels CACHE INTERNAL "apu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
+set(mlu_kernels CACHE INTERNAL "mlu kernels")
 set(bm_kernels CACHE INTERNAL "bm kernels")
+set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
 set(opencl_kernels CACHE INTERNAL "opencl kernels")
 set(host_kernels CACHE INTERNAL "host kernels")

 set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
 file(WRITE ${kernels_src_list} "") # clean
+
+# file to record faked kernels for opt python lib
+set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt")
+file(WRITE ${fake_kernels_src_list} "") # clean
+
 if(LITE_BUILD_TAILOR)
  set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list")
  file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
 endif()
 # add a kernel for some specific device
-# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
+# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
 # level: one of (basic, extra)
 function(add_kernel TARGET device level)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
        LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
        ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -302,63 +343,106 @@ function(add_kernel TARGET device level)
    if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
        return()
    endif()
-
-    if (LITE_ON_MODEL_OPTIMIZE_TOOL)
-      # the source list will collect for model_optimize_tool to fake kernel generation.
-      foreach(src ${args_SRCS})
-          file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
-      endforeach()
-      return()
+    if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN))
+        return()
    endif()

-    # when compiling the model_optimize_tool, a source file with all the fake kernel definitions will be generated,
-    # no need to continue the compilation of the true kernel source.
-    if (LITE_ON_MODEL_OPTIMIZE_TOOL)
-      return()
-    endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
-

    if ("${device}" STREQUAL "Host")
+       if (LITE_ON_MODEL_OPTIMIZE_TOOL)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
+            return()
+        endif()
        set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
    endif()
    if ("${device}" STREQUAL "ARM")
        if (NOT LITE_WITH_ARM)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
            return()
        endif()
        set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
    endif()
    if ("${device}" STREQUAL "X86")
-        if (NOT LITE_WITH_X86)
+        if (NOT LITE_WITH_X86 OR LITE_ON_MODEL_OPTIMIZE_TOOL)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
            return()
        endif()
        set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
    endif()
    if ("${device}" STREQUAL "NPU")
        if (NOT LITE_WITH_NPU)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
            return()
        endif()
        set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
    endif()
+    if ("${device}" STREQUAL "APU")
+        if (NOT LITE_WITH_APU)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
+            return()
+        endif()
+        set(apu_kernels "${apu_kernels};${TARGET}" CACHE INTERNAL "")
+    endif()
    if ("${device}" STREQUAL "XPU")
        if (NOT LITE_WITH_XPU)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
            return()
        endif()
        set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "")
    endif()
    if ("${device}" STREQUAL "FPGA")
        if (NOT LITE_WITH_FPGA)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
            return()
        endif()
        set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
    endif()
    if ("${device}" STREQUAL "BM")
        if (NOT LITE_WITH_BM)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
            return()
        endif()
        set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
    endif()
+    if ("${device}" STREQUAL "RKNPU")
+        if (NOT LITE_WITH_RKNPU)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
+            return()
+        endif()
+        set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "")
+    endif()
+    if ("${device}" STREQUAL "MLU")
+        if (NOT LITE_WITH_MLU)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
+            return()
+        endif()
+        set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
+    endif()
    if ("${device}" STREQUAL "OPENCL")
        if (NOT LITE_WITH_OPENCL)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
            return()
        endif()
        set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "")
@@ -366,6 +450,9 @@ function(add_kernel TARGET device level)

    if ("${device}" STREQUAL "CUDA")
        if (NOT LITE_WITH_CUDA)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
            return()
        endif()
        set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "")
@@ -389,8 +476,11 @@ function(add_kernel TARGET device level)
              ARM_DEPS ${args_ARM_DEPS}
              FPGA_DEPS ${args_FPGA_DEPS}
              NPU_DEPS ${args_NPU_DEPS}
+              APU_DEPS ${args_APU_DEPS}
              XPU_DEPS ${args_XPU_DEPS}
-	      BM_DEPS ${args_BM_DEPS}
+              RKNPU_DEPS ${args_RKNPU_DEPS}
+              BM_DEPS ${args_BM_DEPS}
+              MLU_DEPS ${args_MLU_DEPS}
              PROFILE_DEPS ${args_PROFILE_DEPS}
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}
@@ -409,16 +499,18 @@ endif()
 function(add_operator TARGET level)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
        LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
        ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-
    if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
        return()
    endif()

+    if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN))
+        return()
+    endif()

    foreach(src ${args_SRCS})
      if(LITE_BUILD_TAILOR)
@@ -440,14 +532,40 @@ function(add_operator TARGET level)
              ARM_DEPS ${args_ARM_DEPS}
              FPGA_DEPS ${args_FPGA_DEPS}
              NPU_DEPS ${args_NPU_DEPS}
+              APU_DEPS ${args_APU_DEPS}
              XPU_DEPS ${args_XPU_DEPS}
-	      BM_DEPS ${args_BM_DEPS}
+              RKNPU_DEPS ${args_RKNPU_DEPS}
+              BM_DEPS ${args_BM_DEPS}
+              MLU_DEPS ${args_MLU_DEPS}
              PROFILE_DEPS ${args_PROFILE_DEPS}
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}
      )
 endfunction()

+#only for windows 
+function(create_static_lib TARGET_NAME)
+  set(libs ${ARGN})
+  list(REMOVE_DUPLICATES libs)
+    set(dummy_index 1)
+    set(dummy_offset 1)
+    # the dummy target would be consisted of limit size libraries
+    set(dummy_limit 60)
+    list(LENGTH libs libs_len)
+
+    foreach(lib ${libs})
+      list(APPEND dummy_list ${lib})
+      list(LENGTH dummy_list listlen)
+      if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len}))
+        merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list})
+        set(dummy_list)
+        list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index})
+        MATH(EXPR dummy_index "${dummy_index}+1")
+      endif()
+      MATH(EXPR dummy_offset "${dummy_offset}+1")
+    endforeach()
+    merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list})
+endfunction()

 # Bundle several static libraries into one.
 function(bundle_static_library tgt_name bundled_tgt_name fake_target)
@@ -491,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target)
  set(bundled_tgt_full_name
    ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})

-  #message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}")
+  message(STATUS "bundled_tgt_full_name:  ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+  
+  if(WIN32)
+    set(dummy_tgt_name dummy_${bundled_tgt_name})
+    create_static_lib(${bundled_tgt_name} ${static_libs})
+    add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name})
+    add_dependencies(${fake_target} ${tgt_name})
+  
+    add_library(${dummy_tgt_name} STATIC IMPORTED)
+    set_target_properties(${dummy_tgt_name}
+      PROPERTIES
+        IMPORTED_LOCATION ${bundled_tgt_full_name}
+        INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
+    add_dependencies(${dummy_tgt_name} ${fake_target})
+    return()
+  endif()

  if(NOT IOS)
    file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in

--- a/cmake/mlu.cmake
+++ b/cmake/mlu.cmake
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT LITE_WITH_MLU)
+  return()
+endif()
+
+if(NOT DEFINED NEUWARE_HOME)
+    set(NEUWARE_HOME $ENV{NEUWARE_HOME})
+    if(NOT NEUWARE_HOME)
+        message(FATAL_ERROR "Must set NEUWARE_HOME or env NEUWARE_HOME when LITE_WITH_MLU=ON")
+    endif()
+endif()
+
+message(STATUS "LITE_WITH_MLU: ${LITE_WITH_MLU}")
+find_path(CNML_INC NAMES cnml.h
+  PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
+if(NOT CNML_INC)
+  message(FATAL_ERROR "Can not find cnml.h in ${NEUWARE_HOME}/include")
+endif()
+
+find_path(CNRT_INC NAMES cnrt.h
+  PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
+if(NOT CNRT_INC)
+  message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include")
+endif()
+
+include_directories("${NEUWARE_HOME}/include")
+
+find_library(CNML_LIB_FILE NAMES cnml
+  PATHS ${NEUWARE_HOME}/lib64)
+
+if(NOT CNML_LIB_FILE)
+  message(FATAL_ERROR "Can not find CNML Library in ${NEUWARE_HOME}/lib64")
+else()
+  message(STATUS "Found CNML Library: ${CNML_LIB_FILE}")
+  add_library(cnml_lib SHARED IMPORTED GLOBAL)
+  set_property(TARGET cnml_lib PROPERTY IMPORTED_LOCATION ${CNML_LIB_FILE})
+endif()
+
+find_library(CNRT_LIB_FILE NAMES cnrt
+  PATHS ${NEUWARE_HOME}/lib64)
+
+if(NOT CNRT_LIB_FILE)
+  message(FATAL_ERROR "Can not find CNRT Library in ${NEUWARE_HOME}/lib64")
+else()
+  message(STATUS "Found CNRT Library: ${CNRT_LIB_FILE}")
+  add_library(cnrt_lib SHARED IMPORTED GLOBAL)
+  set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE})
+endif()
--- a/docs/advanced_user_guides/index.rst
+++ b/docs/advanced_user_guides/index.rst
--- a/docs/advanced_user_guides/model_quantization.md
+++ b/docs/advanced_user_guides/model_quantization.md
-# 模型量化
-
-本文主要介绍使用Paddle-Lite加载PaddlePaddle产出的量化模型，并进行推理执行。我们以MobileNetV1模型为示例，首先介绍准备量化模型，然后介绍部署执行。
-
-## 准备量化模型
-
-PaddlePaddle使用量化训练和训练后量化两种方法将FP32模型量化成Int8模型，下面分别介绍两种方法如何产出量化模型。
-
-### 量化训练
-
-目前，PaddlePaddle框架的量化训练主要针对卷积层（包括二维卷积和Depthwise卷积）、和全连接层，对应算子是conv2d、depthwise_conv2d和mul，更多量化训练的原理请参考[文档](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/tutorial.md#1-quantization-aware-training%E9%87%8F%E5%8C%96%E4%BB%8B%E7%BB%8D)。Paddle-Lite支持运行PaddlePaddle框架量化训练产出的模型，可以进一步加快模型在移动端的执行速度。
-
-温馨提示：如果您是初次接触PaddlePaddle框架，建议首先学习[新人入门](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)和[使用指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/index_cn.html)。
-
-
-您可以选择下载训练好的量化模型，或者使用PaddleSlim模型压缩工具训练得到量化模型。
-
-#### 下载量化模型
-
-官方发布了[MobileNetV1量化模型](https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip)，直接下载到本地。
-
-```bash
-wget https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip
-```
-
-#### 使用PaddleSlim模型压缩工具训练量化模型
-
-##### 安装PaddlePaddle
-
-根据操作系统、安装方式、Python版本和CUDA版本，按照[官方说明](https://paddlepaddle.org.cn/start)安装PaddlePaddle。例如：
-
-Ubuntu 16.04.4 LTS操作系统，CUDA9，cuDNN7，GPU版本安装:
-```bash
-pip install paddlepaddle-gpu==1.6.0.post97 -i https://mirrors.aliyun.com/pypi/simple/
-```
-
-Ubuntu 16.04.4 LTS操作系统，CPU版本安装:
-```bash
-pip install paddlepaddle==1.6.0 -i https://mirrors.aliyun.com/pypi/simple/
-```
-
-##### 克隆量化训练所需的代码库
-
-克隆[PaddlePaddle/models](https://github.com/PaddlePaddle/models)到本地，并进入models/PaddleSlim路径。
-
-```bash
-git clone https://github.com/PaddlePaddle/models.git
-cd models/PaddleSlim
-```
-
-##### 数据准备
-###### 训练数据准备
-
-参考[models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#data-preparation)中的数据准备教程，下载训练数据，并且保存到PaddleSlim/data路径下。
-
-###### 预训练模型准备
-
-参考/models/PaddleSlim/run.sh脚本， 从[models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#supported-models-and-performances)下载MobileNetV1的预训练模型，并保存到PaddleSlim/pretrain路径下。
-
-经过以上三步，PaddleSlim目录下的文件结构如下所示：
-
-```bash
-.
-├── compress.py # 模型压缩任务主脚本，定义了压缩任务需要的模型相关信息
-├── configs # 压缩任务的配置文件，包括:蒸馏、int8量化量化、filter剪切和组合策略的配置文件
-├── data # 存放训练数据（需要用户自己创建）
-│   └── ILSVRC2012
-├── pretrain # 存放预训练模型参数，执行run.sh自动生成
-│   ├── MobileNetV1_pretrained
-│   ├── MobileNetV1_pretrained.tar
-│   ├── ResNet50_pretrained
-│   └── ResNet50_pretrained.tar
-├── docs # 文档目录
-├── light_nas
-├── models # 模型网络结构的定义，如MobileNetV1
-├── quant_low_level_api # 量化训练的底层API, 用于灵活定制量化训练的过程，适用于高阶用户
-├── reader.py # 定义数据处理逻辑
-├── README.md
-├── run.sh # 模型压缩任务启动脚本
-└── utility.py # 定义了常用的工具方法
-```
-
-##### 压缩脚本介绍
-
-在`compress.py`中定义了执行压缩任务需要的所有模型相关的信息，这里对几个关键的步骤进行简要介绍：
-
-###### 目标网络的定义
-
-compress.py的以下代码片段定义了train program, 这里train program只有前向计算操作。
-```python
-out = model.net(input=image, class_dim=args.class_dim)
-cost = fluid.layers.cross_entropy(input=out, label=label)
-avg_cost = fluid.layers.mean(x=cost)
-acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-```
-
-然后，通过clone方法得到eval_program, 用来在压缩过程中评估模型精度，如下：
-
-```python
-val_program = fluid.default_main_program().clone()
-```
-
-定义完目标网络结构，需要对其初始化，并根据需要加载预训练模型。
-
-###### 定义feed_list和fetch_list
-对于train program, 定义train_feed_list用于指定从train data reader中取的数据feed给哪些variable。定义train_fetch_list用于指定在训练时，需要在log中展示的结果。如果需要在训练过程中在log中打印accuracy信心，则将('acc_top1', acc_top1.name)添加到train_fetch_list中即可。
-```python
-train_feed_list = [('image', image.name), ('label', label.name)]
-train_fetch_list = [('loss', avg_cost.name)]
-```
-
-> 注意： 在train_fetch_list里必须有loss这一项。
-
-对于eval program. 同上定义eval_feed_list和train_fetch_list:
-
-```python
-val_feed_list = [('image', image.name), ('label', label.name)]
-val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5', acc_top5.name)]
-```
-
-###### Compressor和量化配置文件
-`compress.py`主要使用Compressor和yaml文件完成对模型的量化训练工作。Compressor类的定义如下：
-```python
-class Compressor(object):
-    def __init__(self,
-                 place,
-                 scope,
-                 train_program,
-                 train_reader=None,
-                 train_feed_list=None,
-                 train_fetch_list=None,
-                 eval_program=None,
-                 eval_reader=None,
-                 eval_feed_list=None,
-                 eval_fetch_list=None,
-                 teacher_programs=[],
-                 checkpoint_path='./checkpoints',
-                 train_optimizer=None,
-                 distiller_optimizer=None):
-```
-
-在定义Compressor对象时，需要注意以下问题：
-* train program如果带反向operators和优化更新相关的operators, 参数train_optimizer需要设置为None.
-* eval_program中parameter的名称需要与train_program中的parameter的名称完全一致。
-* 最终保存的量化模型是在eval_program网络基础上进行剪枝保存的。所以，如果用户希望最终保存的模型可以用于inference, 则eval program需要包含推理阶段需要的各种operators.
-* checkpoint保存的是float数据类型的模型。
-
-`configs/quantization.yaml`量化配置文件示例如下：
-
-```python
-version: 1.0
-strategies:
-    quantization_strategy:
-        class: 'QuantizationStrategy'
-        start_epoch: 0
-        end_epoch: 9
-        float_model_save_path: './output/float'
-        mobile_model_save_path: './output/mobile'
-        int8_model_save_path: './output/int8'
-        weight_bits: 8
-        activation_bits: 8
-        weight_quantize_type: 'abs_max'
-        activation_quantize_type: 'moving_average_abs_max'
-        save_in_nodes: ['image']
-        save_out_nodes: ['fc_0.tmp_2']
-compressor:
-    epoch: 10
-    checkpoint_path: './checkpoints_quan/'
-    strategies:
-        - quantization_strategy
-```
-其中，可配置参数包括：
- **class:** 量化策略的类名称，目前仅支持`QuantizationStrategy`。
- **start_epoch:** 在start_epoch开始之前，量化训练策略会往train_program和eval_program插入量化operators和反量化operators。 从start_epoch开始，进入量化训练阶段。
- **end_epoch:** 在end_epoch结束之后，会保存用户指定格式的模型。注意：end_epoch之后并不会停止量化训练，而是继续训练直到epoch数等于compressor.epoch值为止。举例来说，当start_epoch=0，end_epoch=0，compressor.epoch=2时，量化训练开始于epoch0，结束于epoch1，但保存的模型是epoch0结束时的参数状态。
- **float_model_save_path:**  保存float数据格式的模型路径，即该路径下的模型参数范围为int8范围但参数数据类型为float32。如果设置为None, 则不存储float格式的模型，默认为None。**注意：Paddle-Lite即使用该目录下的模型进行量化模型推理优化，详见本文[使用Paddle-Lite运行量化模型推理](#二使用Paddle-Lite运行量化模型推理)部分。**
- **int8_model_save_path:** 保存int8数据格式的模型路径，即该路径下的模型参数范围为int8范围且参数数据类型为int8。如果设置为None, 则不存储int8格式的模型，默认为None.
- **mobile_model_save_path:** 保存兼容paddle-mobile框架的模型路径。如果设置为None, 则不存储paddle-mobile格式的模型，默认为None。目前paddle-mobile已升级为Paddle-Lite。
- **weight_bits:** 量化weight的bit数，注意偏置(bias)参数不会被量化。
- **activation_bits:** 量化activation的bit数。
-  **weight_quantize_type:** weight量化方式，目前量化训练支持`abs_max`、 `channel_wise_abs_max`。
- **activation_quantize_type:** activation量化方式，目前量化训练支持`range_abs_max`、`moving_average_abs_max`。PaddlePaddle中还支持 `abs_max` 方法对激活进行量化，但是该方法动态计算输入的量化scale，这会增加计算量、减慢模型推理速度，所以lite不支持 `abs_max`激活量化方式。
- **save_in_nodes:** variable名称列表。在保存量化后模型的时候，需要根据save_in_nodes对eval programg 网络进行前向遍历剪枝。默认为eval_feed_list内指定的variable的名称列表。
- **save_out_nodes:** varibale名称列表。在保存量化后模型的时候，需要根据save_out_nodes对eval programg 网络进行回溯剪枝。默认为eval_fetch_list内指定的variable的名称列表。
-
-> **备注：**
->
-> 1）`abs_max`意为在训练的每个step及inference阶段均动态计算量化scale值。`channel_wise_abs_max`与`abs_max`类似，不同点在于它会对卷积权重进行分channel求取量化scale。换言之，`abs_max`属于tensor-wise量化，而`channel_wise_abs_max`属于channel-wise量化，详细说明请猛戳[此处](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/quantization/training_quantization_model_format.md)。
-> 
-> 2）`moving_average_abs_max`和`range_abs_max`意为在训练阶段计算出一个静态的量化scale值，并将其用于inference阶段。`moving_average_abs_max`使用窗口滑动平均的方法计算量化scale，而`range_abs_max`则使用窗口绝对值最大值的方式。
-> 
-> 3）**目前，Paddle-Lite仅支持运行weight量化方式使用`abs_max`且activation量化方式使用`moving_average_abs_max`或`range_abs_max`产出的量化模型**。
-
-##### 执行int8量化训练
-
-修改run.sh，即注释掉`# enable GC strategy`与`# for sensitivity filter pruning`之间的内容并打开`#for quantization`相关的脚本命令（所需打开注释的命令如下所示）。
-
-```bash
-# for quantization
-#---------------------------
-export CUDA_VISIBLE_DEVICES=0
-python compress.py \
--batch_size 64 \
--model "MobileNet" \
--pretrained_model ./pretrain/MobileNetV1_pretrained \
--compress_config ./configs/quantization.yaml \
--quant_only True
-```
-最后，运行`sh run.sh`命令开始int8量化训练。
-
-上述量化训练过程完成后，若按照本文中所述`configs/quantization.yaml`文件内容配置的模型输出路径，则可在models/PaddleSlim/output目录下看到`float`、`int8`和`mobile`三个目录，其中：
-* float目录: 参数范围为int8范围但参数数据类型为float32的量化模型。Paddle-Lite即使用该目录下的模型文件及参数进行量化模型的部署。
-* int8目录: 参数范围为int8范围且参数数据类型为int8的量化模型。
-* mobile目录：参数特点与int8目录相同且兼容paddle-mobile的量化模型（目前paddle-mobile已升级为Paddle-Lite）。
-
-### 训练后量化
-
-下面以MobileNetV1为例，介绍使用训练后量化方法产出量化模型。关于训练后量化的原理和详细使用方法，请参考[文档](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)。
-
-> 该示例的代码放在[models/PaddleSlim/quant_low_level_api/](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)目录下。如果需要执行该示例，首先clone下来[models](https://github.com/PaddlePaddle/models.git)，安装具有训练后量化功能的PaddlePaddle。因为目前Lite支持支持对conv2d、depthwise_conv2d和mul量化，所以修改[run_post_training_quanzation.sh](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/run_post_training_quanzation.sh) 脚本，设置is_full_quantize=False，然后执行该脚本；执行结束后，量化模型保存在`mobilenetv1_int8_model`目录下。下面介绍详细步骤。
-
-1）**准备模型和校准数据**
-
-安装PaddlePaddle的develop分支编译的whl包，准备已经训练好的FP32预测模型。
-
-准备校准数据，文件结构如下。val文件夹中有100张图片，val_list.txt文件中包含图片的label。
-```bash
-samples_100
-└──val
-└──val_list.txt
-```
-
-2）**配置校准数据生成器**
-
-MobileNetV1的输入是图片和标签，所以配置读取校准数据的sample_generator，每次返回一张图片和一个标签。详细代码在[models/PaddleSlim/reader.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/reader.py)。
-
-3）**调用训练后量化**
-
-调用训练后量化的核心代码如下，详细代码在[post_training_quantization.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/post_training_quantization.py)。
-``` python
-place = fluid.CUDAPlace(0) if args.use_gpu == "True" else fluid.CPUPlace()
-exe = fluid.Executor(place)
-sample_generator = reader.val(data_dir=args.data_path)
-
-ptq = PostTrainingQuantization(
-    executor=exe,
-    sample_generator=sample_generator,
-    model_dir=args.model_dir,
-    model_filename=args.model_filename,
-    params_filename=args.params_filename,
-    batch_size=args.batch_size,
-    batch_nums=args.batch_nums,
-    algo=args.algo,
-    is_full_quantize=args.is_full_quantize == "True")
-quantized_program = ptq.quantize()
-ptq.save_quantized_model(args.save_model_path)
-```
-
-## 使用Paddle-Lite运行量化模型推理
-
-#### 使用模型优化工具对量化模型进行优化
-
-接下来，使用原始的量化模型生成适合在移动端直接部署的模型。
-
-参考[源码编译](../source_compile)配置编译环境，确保可以编译成功。参考[模型转化方法](../model_optimize_tool)，首先编译model_optimize_tool工具，然后执行下面命令对量化训练的模型进行优化（注意，需要自行修改model_file、param_file和optimize_out）。
-```bash
-./model_optimize_tool                         \
--model_file=mobilenet_v1_quant/float/model   \
--param_file=mobilenet_v1_quant/float/weights \
--optimize_out_type=naive_buffer              \
--optimize_out=mobilenet_v1_quant_opt         \
--valid_targets=arm                           \
--prefer_int8_kernel=true
-```
-
-如前所述，量化训练后，float目录下的模型参数范围为int8，但参数数据类型仍为float32类型，这样确实没有起到模型参数压缩的效果。但是，经过model\_optimize\_tool工具优化后对应的量化参数均会以int8类型重新存储达到参数压缩的效果，且模型结构也被优化（如进行了各种operator fuse操作）。
-
-#### 在手机端准备量化模型文件
-
-使用如下命令将mobilenet_v1_quant_opt目录下的量化模型文件导入到手机端：
-
-```bash
-adb push mobilenet_v1_quant_opt /data/local/tmp
-```
-
-#### 使用mobilenetv1\_light\_api运行优化后的量化模型
-
-参考[源码编译](../source_compile)配置编译环境后，在Paddle-Lite执行如下命令获取轻量级API的demo：
-
-```bash
-cd /Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light
-make clean && make -j
-```
-执行完上述命令后，可在`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/`路径下看到`mobilenetv1_light_api`可执行文件。将`mobilenetv1_light_api`导入到手机端并运行量化模型推理。执行命令如下：
-
-```bash
-adb push Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp
-adb shell chmod +x /data/local/tmp/mobilenetv1_light_api
-adb shell /data/local/tmp/mobilenetv1_light_api               \
-    --model_dir=/data/local/tmp/mobilenet_v1_quant_opt
-```
-**程序运行结果如下：**
-```bash
-Output dim: 1000
-Output[0]: 0.000228
-Output[100]: 0.000260
-Output[200]: 0.000250
-Output[300]: 0.000560
-Output[400]: 0.000950
-Output[500]: 0.000275
-Output[600]: 0.005143
-Output[700]: 0.002509
-Output[800]: 0.000538
-Output[900]: 0.000969
-```
-在C++中使用Paddle-Lite API的方法请猛戳[此处](../cpp_demo)，用户也可参考[mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc)的代码示例。
-
-### FAQ
-
-**问题**：Compiled with WITH_GPU, but no GPU found in runtime
-
-**解答**：检查本机是否支持GPU训练，如果不支持请使用CPU训练。如果在docker进行GPU训练，请使用nvidia_docker启动容器。
-
-**问题**：Inufficient GPU memory to allocation. at [/paddle/paddle/fluid/platform/gpu_info.cc:262]
-  
-**解答**：正确设置run.sh脚本中`CUDA_VISIBLE_DEVICES`，确保显卡剩余内存大于需要内存。
--- a/docs/advanced_user_guides/x86.md
+++ b/docs/advanced_user_guides/x86.md
-# 使用X86预测库
-
-Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../installation/source_compile)。
-
-(注意：非docker Linux环境需要是Ubuntu16.04)
-
-## 编译
-
-1、 下载代码
-```bash
-git clone https://github.com/PaddlePaddle/Paddle-Lite.git
-#需要切换到 release/v2.0.0之后版本
-git checkout <release_tag>
-```
-
-2、 源码编译
-
-```bash
-cd Paddle-Lite
-./lite/tools/build.sh x86
-```
-
-## 编译结果说明
-
-x86编译结果位于 `build.lite.x86/inference_lite_lib`
-**具体内容**说明：
-
-1、 `bin`文件夹：可执行工具文件 `test_model_bin`
-
-2、 `cxx`文件夹：包含c++的库文件与相应的头文件
-
- `include`  : 头文件
- `lib` : 库文件
-  - 打包的静态库文件：
-    - `libpaddle_api_full_bundled.a`  ：包含 full_api 和 light_api 功能的静态库
-    - `libpaddle_api_light_bundled.a` ：只包含 light_api 功能的静态库
-  - 打包的动态态库文件：
-    - `libpaddle_full_api_shared.so` ：包含 full_api 和 light_api 功能的动态库
-    - `libpaddle_light_api_shared.so`：只包含 light_api 功能的动态库
-
-3、 `third_party` 文件夹：第三方库文件
-
-## x86预测API使用示例
-
-```c++
-#include <gflags/gflags.h>
-#include <iostream>
-#include <vector>
-#include "paddle_api.h"          // NOLINT
-#include "paddle_use_kernels.h"  // NOLINT
-#include "paddle_use_ops.h"      // NOLINT
-#include "paddle_use_passes.h"   // NOLINT
-
-using namespace paddle::lite_api;  // NOLINT
-
-DEFINE_string(model_dir, "", "Model dir path.");
-DEFINE_string(optimized_model_dir, "", "Optimized model dir.");
-DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
-
-int64_t ShapeProduction(const shape_t& shape) {
-  int64_t res = 1;
-  for (auto i : shape) res *= i;
-  return res;
-}
-void RunModel() {
-  // 1. Set CxxConfig
-  CxxConfig config;
-  config.set_model_file(FLAGS_model_dir + "model");
-  config.set_param_file(FLAGS_model_dir + "params");
-
-  config.set_valid_places({
-    lite_api::Place{TARGET(kX86), PRECISION(kFloat)}
-  });
-
-  // 2. Create PaddlePredictor by CxxConfig
-  std::shared_ptr<PaddlePredictor> predictor =
-      CreatePaddlePredictor<CxxConfig>(config);
-
-  // 3. Prepare input data
-  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
-  input_tensor->Resize(shape_t({1, 3, 224, 224}));
-  auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-    data[i] = 1;
-  }
-
-  // 4. Run predictor
-  predictor->Run();
-
-  // 5. Get output
-  std::unique_ptr<const Tensor> output_tensor(
-      std::move(predictor->GetOutput(0)));
-  std::cout << "Output dim: " << output_tensor->shape()[1] << std::endl;
-  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    std::cout << "Output[" << i << "]:" << output_tensor->data<float>()[i] << std::endl;
-  }
-}
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  RunModel();
-  return 0;
-}
-```
--- a/docs/advanced_user_guides/cv.md
+++ b/docs/advanced_user_guides/cv.md
-# CV 图像预处理API接口介绍
+# CV图像预处理API

-请把编译脚本`Paddle-Lite/lite/too/build.sh`中`BUILD_CV`变量设置为`ON`， 其他编译参数设置请参考[源码编译](../source_compile)， 以确保 Lite 可以正确编译。这样`CV`图像的加速库就会编译进去，且会生成`paddle_image_preprocess.h`的API文件
+请把编译脚本`Paddle-Lite/lite/too/build.sh`中`BUILD_CV`变量设置为`ON`， 其他编译参数设置请参考[源码编译](../user_guides/source_compile)， 以确保 Lite 可以正确编译。这样`CV`图像的加速库就会编译进去，且会生成`paddle_image_preprocess.h`的API文件

 - 硬件平台： `ARM`
 - 操作系统：`MAC` 和 `LINUX`

--- a/docs/api_reference/cxx_api_doc.md
+++ b/docs/api_reference/cxx_api_doc.md

-# C++ API文档
+# C++ API

 ## CreatePaddlePredictor

@@ -260,14 +260,14 @@ class MobileConfig;

 `MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息，如NaiveBuffer格式的模型地址、模型的内存地址(从内存加载模型时使用)、能耗模式、工作线程数等等。

-*注意：输入的模型需要使用[Model Optimize Tool](../model_optimize_tool)转化为NaiveBuffer格式的优化模型。*
+*注意：输入的模型需要使用[Model Optimize Tool](../user_guides/model_optimize_tool)转化为NaiveBuffer格式的优化模型。*

 示例：

 ```c++
 MobileConfig config;
 // 设置NaiveBuffer格式模型目录，从文件加载模型时使用
-config.set_model_dir(FLAGS_model_dir);
+config.set_model_from_file(<your_model_path>);
 // 设置工作线程数
 config.set_threads(4);
 // 设置能耗模式
@@ -277,13 +277,13 @@ config.set_power_mode(LITE_POWER_HIGH);
 std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
 ```

-### `set_model_from_file(model_dir)`
+### `set_model_from_file(model_file)`

 设置模型文件，当需要从磁盘加载模型时使用。

 参数：

- `model_dir(std::string)` - 模型文件路径
+- `model_file(std::string)` - 模型文件路径

 返回：`None`

@@ -400,7 +400,7 @@ std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>

 - `None`

-返回：内存中模型结构数据
+返回：内存中模型参数数据

 返回类型：`const std::string&`

@@ -589,7 +589,7 @@ for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {

 根据名称获取输出Tensor的指针。

-**注意**：`GetTensor`接口是为开发者设计的调试接口，可以输出[转化](../model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`，可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。
+**注意**：`GetTensor`接口是为开发者设计的调试接口，可以输出[转化](../user_guides/model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`，可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。

 参数：


--- a/docs/api_reference/index.rst
+++ b/docs/api_reference/index.rst
--- a/docs/api_reference/java_api_doc.md
+++ b/docs/api_reference/java_api_doc.md
+# Java API
+
+## MobileConfig
+
+```java
+public class MobileConfig extends ConfigBase;
+```
+
+`MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息，如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。
+
+*注意：输入的模型需要使用Model Optimize Tool转化为NaiveBuffer格式的优化模型。*
+
+示例：
+
+```java
+MobileConfig config = new MobileConfig();
+// 设置NaiveBuffer格式模型目录
+config.setModelFromFile(modelfile);
+// 设置能耗模式
+config.setPowerMode(PowerMode.LITE_POWER_HIGH);
+// 设置工作线程数
+config.setThreads(1);
+
+// 根据MobileConfig创建PaddlePredictor
+PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
+```
+
+### ``setModelFromFile(model_file)``
+
+设置模型文件夹路径。
+
+参数：
+
+- `model_file(String)` - 模型文件路径
+
+返回：`None`
+
+返回类型：`void`
+
+
+
+### ``setModelDir(model_dir)``
+
+**注意**：Lite模型格式在release/v2.3.0之后修改，本接口为加载老格式模型的接口，将在release/v3.0.0废弃。建议替换为`setModelFromFile`接口。
+
+设置模型文件夹路径。
+
+参数：
+
+- `model_dir(String)` - 模型文件夹路径
+
+返回：`None`
+
+返回类型：`void`
+
+
+
+### ``setModelFromBuffer(model_buffer)``
+
+设置模型的内存数据，当需要从内存加载模型时使用。
+
+参数：
+
+- `model_buffer(str)` - 内存中的模型数据
+
+返回：`None`
+
+返回类型：`void`
+
+
+
+### `getModelDir()`
+
+返回设置的模型文件夹路径。
+
+参数：
+
+- `None`
+
+返回：模型文件夹路径
+
+返回类型：`String`
+
+
+
+### `setPowerMode(mode)`
+
+设置CPU能耗模式。若不设置，则默认使用`LITE_POWER_HIGH`。
+
+*注意：只在开启`OpenMP`时生效，否则系统自动调度。*
+
+参数：
+
+- `mode(PowerMode)` - CPU能耗模式。
+
+返回：`None`
+
+返回类型：`void`
+
+
+
+### `getPowerMode()`
+
+获取设置的CPU能耗模式。
+
+参数：
+
+- `None`
+
+返回：设置的CPU能耗模式
+
+返回类型：`PowerMode`
+
+
+
+### `setThreads(threads)`
+
+设置工作线程数。若不设置，则默认使用单线程。
+
+*注意：只在开启`OpenMP`的模式下生效，否则只使用单线程。*
+
+参数：
+
+- `threads(int)` - 工作线程数。默认为1。
+
+返回：`None`
+
+返回类型：`void`
+
+
+
+### `getThreads()`
+
+获取设置的工作线程数。
+
+参数：
+
+- `None`
+
+返回：工作线程数
+
+返回类型：`int`
+
+## PaddlePredictor
+
+```java
+public class PaddlePredictor;
+```
+
+`PaddlePredictor`是Paddle-Lite的预测器。用户可以根据PaddlePredictor提供的接口使用MobileConfig创建新的预测器、设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。
+
+示例：
+
+```java
+// 设置MobileConfig
+MobileConfig config = new MobileConfig();
+config.setModelDir(modelPath);
+
+// 创建PaddlePredictor
+PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
+
+// 设置输入数据
+long[] dims = {100, 100};
+float[] inputBuffer = new float[10000];
+for (int i = 0; i < 10000; ++i) {
+    inputBuffer[i] = i;
+}
+Tensor input = predictor.getInput(0);
+input.resize(dims);
+input.setData(inputBuffer);
+
+// 执行预测
+predictor.run();
+
+// 获取输出数据
+Tensor output = predictor.getOutput(0);
+float[] output = result.getFloatData();
+for (int i = 0; i < 1000; ++i) {
+    System.out.println(output[i]);
+}
+```
+
+
+
+### `CreatePaddlePredictor(config)`
+
+```java
+public static PaddlePredictor createPaddlePredictor(ConfigBase config);
+```
+
+`CreatePaddlePredictor`用来根据`ConfigBase`动态创建预测器，目前Java API支持使用MobileConfig`。框架会根据您在config中指定的模型路径、能耗模型、工作线程数等自动创建一个预测器。
+
+参数：
+
+- `config(ConfigBase，目前应使用MobileConfig)` - 创建预测器的配置信息
+
+返回：根据config创建完成的预测器
+
+返回类型：`PaddlePredictor`
+
+
+
+### `getInput(index)`
+
+获取输入Tensor，用来设置模型的输入数据。
+
+参数：
+
+- `index(int)` - 输入Tensor的索引
+
+返回：第`index`个输入`Tensor`
+
+返回类型：`Tensor`
+
+
+
+### `getOutput(index)`
+
+获取输出Tensor，用来获取模型的输出结果。
+
+参数：
+
+- `index(int)` - 输出Tensor的索引
+
+返回：第`index`个输出Tensor
+
+返回类型：`Tensor`
+
+
+
+### `run()`
+
+执行模型预测，需要在***设置输入数据后***调用。
+
+参数：
+
+- `None`
+
+返回：预测执行状态，成功返回`true`，否则返回`false`
+
+返回类型：`boolean`
+
+
+
+### `getVersion()`
+
+用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息，如`v2.0-beta`；否则返回代码的`branch(commitid)`，如`develop(7e44619)`。
+
+参数：
+
+- `None`
+
+返回：当前lib使用的代码版本信息
+
+返回类型：`String`
+
+## PowerMode
+
+```java
+public enum PowerMode;
+```
+
+`PowerMode`为ARM CPU能耗模式，用户可以根据应用场景设置能耗模式获得最优的能效比。
+
+示例：
+
+```java
+MobileConfig config = new MobileConfig();
+// 设置NaiveBuffer格式模型目录
+config.setModelDir(modelPath);
+// 设置能耗模式
+config.setPowerMode(PowerMode.LITE_POWER_HIGH);
+
+// 根据MobileConfig创建PaddlePredictor
+PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
+```
+
+PowerMode详细说明如下：
+
+|         选项         | 说明                                                         |
+| :------------------: | ------------------------------------------------------------ |
+|   LITE_POWER_HIGH    | 绑定大核运行模式。如果ARM CPU支持big.LITTLE，则优先使用并绑定Big cluster。如果设置的线程数大于大核数量，则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败，如果失败则进入不绑核模式。 |
+|    LITE_POWER_LOW    | 绑定小核运行模式。如果ARM CPU支持big.LITTLE，则优先使用并绑定Little cluster。如果设置的线程数大于小核数量，则会将线程数自动缩放到小核数量。如果找不到小核，则自动进入不绑核模式。 |
+|   LITE_POWER_FULL    | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时，则会自动将线程数缩放到核心数量。 |
+|  LITE_POWER_NO_BIND  | 不绑核运行模式（推荐）。系统根据负载自动调度任务到空闲的CPU核心上。 |
+| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心，则每预测10次后切换绑定到下一个核心。 |
+| LITE_POWER_RAND_LOW  | 轮流绑定小核模式。如果Little cluster有多个核心，则每预测10次后切换绑定到下一个核心。 |
+
+
+
+## Tensor
+
+```c++
+public class Tensor;
+```
+
+Tensor是Paddle-Lite的数据组织形式，用于对底层数据进行封装并提供接口对数据进行操作，包括设置维度、数据等。
+
+*注意：用户应使用`PaddlePredictor`的`getInput`和`getOuput`接口获取输入/输出的`Tensor`。*
+
+示例：
+
+```java
+// 导入Java API
+import com.baidu.paddle.lite.MobileConfig;
+import com.baidu.paddle.lite.Tensor;
+import com.baidu.paddle.lite.Predictor;
+import com.baidu.paddle.lite.PowerMode;
+
+// 设置MobileConfig
+MobileConfig config = new MobileConfig();
+config.setModelDir(modelPath);
+
+// 创建PaddlePredictor
+PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
+
+// 设置输入数据
+long[] dims = {100, 100};
+float[] inputBuffer = new float[10000];
+for (int i = 0; i < 10000; ++i) {
+    inputBuffer[i] = i;
+}
+// 获取输入Tensor
+Tensor input = predictor.getInput(0);
+// 设置输入维度
+input.resize(dims);
+// 设置输入数据
+input.setData(inputBuffer);
+
+// 执行预测
+predictor.run();
+
+// 获取输出Tensor
+Tensor result = predictor.getOutput(0);
+// 获取输出数据
+float[] output = result.getFloatData();
+for (int i = 0; i < 1000; ++i) {
+    System.out.println(output[i]);
+}
+```
+
+### `resize(dims)`
+
+设置Tensor的维度信息。
+
+参数：
+
+- `dims(long[])` - 维度信息
+
+返回：设置成功返回`true`，否则返回`false`
+
+返回类型：`boolean`
+
+
+
+### `shape()`
+
+获取Tensor的维度信息。
+
+参数：
+
+- `None`
+
+返回：Tensor的维度信息
+
+返回类型：`long[]`
+
+
+
+### `setData(data)`
+
+设置Tensor数据。
+
+参数：
+
+- `data(float[])` - 需要设置的数据
+
+返回：成功则返回`true`，否则返回`false`
+
+返回类型：`boolean`
+
+
+
+### `getFloatData()`
+
+获取Tensor的底层float型数据。
+
+参数：
+
+- `None`
+
+返回：`Tensor`底层数据
+
+返回类型：`float[]`
--- a/docs/api_reference/python_api/CxxConfig.md
+++ b/docs/api_reference/python_api/CxxConfig.md
+## CxxConfig
+
+```python
+class CxxConfig;
+```
+
+`CxxConfig`用来配置构建CxxPredictor的配置信息，如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+config = CxxConfig()
+# 设置模型目录，加载非combined模型时使用
+config.set_model_dir(<your_model_dir_path>)
+# 设置工作线程数(该接口只支持armlinux)
+# config.set_threads(4);
+# 设置能耗模式(该接口只支持armlinux)
+# config.set_power_mode(PowerMode.LITE_POWER_NO_BIND)
+# 设置valid places
+places = [Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 根据CxxConfig创建CxxPredictor
+predictor = lite.create_paddle_predictor(config)
+```
+
+### `set_model_dir(model_dir)`
+
+设置模型文件夹路径，当需要从磁盘加载非combined模型时使用。
+
+参数：
+
+- `model_dir(str)` - 模型文件夹路径
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `model_dir()`
+
+返回设置的模型文件夹路径。
+
+参数：
+
+- `None`
+
+返回：模型文件夹路径
+
+返回类型：`str`
+
+
+
+### `set_model_file(model_file)`
+
+设置模型文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `model_file(str)` - 模型文件路径
+
+返回类型：`None`
+
+
+
+### `model_file()`
+
+获取设置模型文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `None`
+
+返回：模型文件路径
+
+返回类型：`str`
+
+
+
+### `set_param_file(param_file)`
+
+设置模型参数文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `param_file(str)` - 模型文件路径
+
+返回类型：`None`
+
+
+
+### `param_file()`
+
+获取设置模型参数文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `None`
+
+返回：模型参数文件路径
+
+返回类型：`str`
+
+
+
+### `set_valid_places(valid_places)`
+
+设置可用的places列表。
+
+参数：
+
+- `valid_places(list)` - 可用place列表。
+
+返回类型：`None`
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+config = CxxConfig()
+# 设置模型目录，加载非combined模型时使用
+config.set_model_dir(<your_model_dir_path>)
+# 设置valid places
+# 注意，valid_places列表中Place的排序表明了用户对Place的偏好程度，如用户想优先使用ARM上Int8精度的
+# kernel，则应把Place(TargetType.ARM, PrecisionType.INT8)置于valid_places列表的首位。
+places = [Place(TargetType.ARM, PrecisionType.INT8),
+          Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 根据CxxConfig创建CxxPredictor
+predictor = create_paddle_predictor(config)
+```
+
+
+
+### `set_power_mode(mode)`
+
+设置CPU能耗模式，该接口只支持`armlinux`平台。若不设置，则默认使用`PowerMode.LITE_POWER_HIGH`。
+
+*注意：只在开启`OpenMP`时生效，否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `mode(PowerMode)` - CPU能耗模式
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `power_mode()`
+
+获取设置的CPU能耗模式，该接口只支持`armlinux`平台。
+
+*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `None`
+
+返回：设置的CPU能耗模式
+
+返回类型：`PowerMode`
+
+
+
+### `set_threads(threads)`
+
+设置工作线程数，该接口只支持`armlinux`平台。若不设置，则默认使用单线程。
+
+*注意：只在开启`OpenMP`的模式下生效，否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `threads(int)` - 工作线程数
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `threads()`
+
+获取设置的工作线程数，该接口只支持`armlinux`平台。
+
+*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `None`
+
+返回：工作线程数
+
+返回类型：`int`
--- a/docs/api_reference/python_api/CxxPredictor.md
+++ b/docs/api_reference/python_api/CxxPredictor.md
+## CxxPredictor
+
+```c++
+class CxxPredictor
+```
+
+`CxxPredictor`是Paddle-Lite的预测器，由`create_paddle_predictor`根据`CxxConfig`进行创建。用户可以根据CxxPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。
+
+示例：
+
+```python
+from paddlelite.lite import *
+from lite_core import *
+
+# 1. 设置CxxConfig
+config = CxxConfig()
+if args.model_file != '' and args.param_file != '':
+    config.set_model_file(args.model_file)
+    config.set_param_file(args.param_file)
+else:
+    config.set_model_dir(args.model_dir)
+places = [Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 2. 创建CxxPredictor
+predictor = create_paddle_predictor(config)
+
+# 3. 设置输入数据
+input_tensor = predictor.get_input(0)
+input_tensor.resize([1, 3, 224, 224])
+input_tensor.set_float_data([1.] * 3 * 224 * 224)
+
+# 4. 运行模型
+predictor.run()
+
+# 5. 获取输出数据
+output_tensor = predictor.get_output(0)
+print(output_tensor.shape())
+print(output_tensor.float_data()[:10])
+```
+
+### `get_input(index)`
+
+获取输入Tensor，用来设置模型的输入数据。
+
+参数：
+
+- `index(int)` - 输入Tensor的索引
+
+返回：第`index`个输入`Tensor`
+
+返回类型：`Tensor`
+
+
+
+### `get_output(index)`
+
+获取输出Tensor，用来获取模型的输出结果。
+
+参数：
+
+- `index(int)` - 输出Tensor的索引
+
+返回：第`index`个输出`Tensor`
+
+返回类型：`Tensor`
+
+
+
+### `run()`
+
+执行模型预测，需要在***设置输入数据后***调用。
+
+参数：
+
+- `None`
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `get_version()`
+
+用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息，如`v2.0-beta`；否则返回代码的`branch(commitid)`，如`develop(7e44619)`。
+
+参数：
+
+- `None`
+
+返回：当前lib使用的代码版本信息
+
+返回类型：`str`
--- a/docs/api_reference/python_api/LightPredictor.md
+++ b/docs/api_reference/python_api/LightPredictor.md
+## LightPredictor
+
+```c++
+class LightPredictor
+```
+
+`LightPredictor`是Paddle-Lite的预测器，由`create_paddle_predictor`根据`MobileConfig`进行创建。用户可以根据LightPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。
+
+示例：
+
+```python
+from __future__ import print_function
+from paddlelite.lite import *
+
+# 1. 设置MobileConfig
+config = MobileConfig()
+config.set_model_dir(args.model_dir)
+
+# 2. 创建LightPredictor
+predictor = create_paddle_predictor(config)
+
+# 3. 设置输入数据
+input_tensor = predictor.get_input(0)
+input_tensor.resize([1, 3, 224, 224])
+input_tensor.set_float_data([1.] * 3 * 224 * 224)
+
+# 4. 运行模型
+predictor.run()
+
+# 5. 获取输出数据
+output_tensor = predictor.get_output(0)
+print(output_tensor.shape())
+print(output_tensor.float_data()[:10])
+```
+
+### `get_input(index)`
+
+获取输入Tensor，用来设置模型的输入数据。
+
+参数：
+
+- `index(int)` - 输入Tensor的索引
+
+返回：第`index`个输入`Tensor`
+
+返回类型：`Tensor`
+
+
+
+### `get_output(index)`
+
+获取输出Tensor，用来获取模型的输出结果。
+
+参数：
+
+- `index(int)` - 输出Tensor的索引
+
+返回：第`index`个输出`Tensor`
+
+返回类型：`Tensor`
+
+
+
+### `run()`
+
+执行模型预测，需要在***设置输入数据后***调用。
+
+参数：
+
+- `None`
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `get_version()`
+
+用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息，如`v2.0-beta`；否则返回代码的`branch(commitid)`，如`develop(7e44619)`。
+
+参数：
+
+- `None`
+
+返回：当前lib使用的代码版本信息
+
+返回类型：`str`
--- a/docs/api_reference/python_api/MobileConfig.md
+++ b/docs/api_reference/python_api/MobileConfig.md
+## MobileConfig
+
+```python
+class MobileConfig;
+```
+
+`MobileConfig`用来配置构建LightPredictor的配置信息，如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+config = MobileConfig()
+# 设置NaiveBuffer格式模型目录
+config.set_model_from_file(<your_model_path>)
+# 设置工作线程数
+config.set_threads(4);
+# 设置能耗模式
+config.set_power_mode(PowerMode.LITE_POWER_NO_BIND)
+
+# 根据MobileConfig创建LightPredictor
+predictor = create_paddle_predictor(config)
+```
+
+### `set_model_from_file(model_file)`
+
+**注意**：`model_file`应该是经过`opt`优化后产生的`NaiveBuffer`格式的模型。
+
+设置模型文件夹路径。
+
+参数：
+
+- `model_file(str)` - 模型文件路径
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `set_model_dir(model_dir)`
+
+**注意**：Lite模型格式在release/v2.3.0之后修改，本接口为加载老格式模型的接口，将在release/v3.0.0废弃。建议替换为`setModelFromFile`接口。`model_dir`应该是经过`Model Optimize Tool`优化后产生的`NaiveBuffer`格式的模型。
+
+设置模型文件夹路径。
+
+参数：
+
+- `model_dir(str)` - 模型文件夹路径
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `set_model_from_buffer(model_buffer)`
+
+设置模型的内存数据，当需要从内存加载模型时使用。
+
+参数：
+
+- `model_buffer(str)` - 内存中的模型数据
+
+返回：`None`
+
+返回类型：`void`
+
+
+
+
+### `model_dir()`
+
+返回设置的模型文件夹路径。
+
+参数：
+
+- `None`
+
+返回：模型文件夹路径
+
+返回类型：`str`
+
+
+
+### `set_power_mode(mode)`
+
+设置CPU能耗模式。若不设置，则默认使用`PowerMode.LITE_POWER_HIGH`。
+
+*注意：只在开启`OpenMP`时生效，否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `mode(PowerMode)` - CPU能耗模式
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `power_mode()`
+
+获取设置的CPU能耗模式，该接口只支持`armlinux`平台。
+
+*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `None`
+
+返回：设置的CPU能耗模式
+
+返回类型：`PowerMode`
+
+
+
+### `set_threads(threads)`
+
+设置工作线程数，该接口只支持`armlinux`平台。若不设置，则默认使用单线程。
+
+*注意：只在开启`OpenMP`的模式下生效，否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `threads(int)` - 工作线程数
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `threads()`
+
+获取设置的工作线程数，该接口只支持`armlinux`平台。
+
+*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `None`
+
+返回：工作线程数
+
+返回类型：`int`
--- a/docs/api_reference/python_api/PowerMode.md
+++ b/docs/api_reference/python_api/PowerMode.md
+## PowerMode
+
+```python
+class PowerMode;
+```
+
+`PowerMode`为ARM CPU能耗模式，用户可以根据应用场景设置能耗模式获得最优的能效比。
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+config = MobileConfig()
+# 设置NaiveBuffer格式模型目录
+config.set_model_dir(<your_model_dir_path>)
+# 设置能耗模式
+config.set_power_mode(PowerMode.LITE_POWER_NO_BIND)
+
+# 根据MobileConfig创建LightPredictor
+predictor = create_paddle_predictor(config)
+```
+
+PowerMode详细说明如下：
+
+|         选项         | 说明                                                         |
+| :------------------: | ------------------------------------------------------------ |
+|   LITE_POWER_HIGH    | 绑定大核运行模式。如果ARM CPU支持big.LITTLE，则优先使用并绑定Big cluster。如果设置的线程数大于大核数量，则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败，如果失败则进入不绑核模式。 |
+|    LITE_POWER_LOW    | 绑定小核运行模式。如果ARM CPU支持big.LITTLE，则优先使用并绑定Little cluster。如果设置的线程数大于小核数量，则会将线程数自动缩放到小核数量。如果找不到小核，则自动进入不绑核模式。 |
+|   LITE_POWER_FULL    | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时，则会自动将线程数缩放到核心数量。 |
+|  LITE_POWER_NO_BIND  | 不绑核运行模式（推荐）。系统根据负载自动调度任务到空闲的CPU核心上。 |
+| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心，则每预测10次后切换绑定到下一个核心。 |
+| LITE_POWER_RAND_LOW  | 轮流绑定小核模式。如果Little cluster有多个核心，则每预测10次后切换绑定到下一个核心。 |
--- a/docs/api_reference/python_api/Tensor.md
+++ b/docs/api_reference/python_api/Tensor.md
+## Tensor
+
+```c++
+class Tensor
+```
+
+Tensor是Paddle-Lite的数据组织形式，用于对底层数据进行封装并提供接口对数据进行操作，包括设置Shape、数据、LoD信息等。
+
+*注意：用户应使用`CxxPredictor`或`LightPredictor`的`get_input`和`get_output`接口获取输入/输出的`Tensor`。*
+
+示例：
+
+```python
+from paddlelite.lite import *
+from lite_core import *
+
+# 1. 设置CxxConfig
+config = CxxConfig()
+if args.model_file != '' and args.param_file != '':
+    config.set_model_file(args.model_file)
+    config.set_param_file(args.param_file)
+else:
+    config.set_model_dir(args.model_dir)
+places = [Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 2. 创建CxxPredictor
+predictor = create_paddle_predictor(config)
+
+# 3. 设置输入数据
+input_tensor = predictor.get_input(0)
+input_tensor.resize([1, 3, 224, 224])
+input_tensor.set_float_data([1.] * 3 * 224 * 224)
+
+# 4. 运行模型
+predictor.run()
+
+# 5. 获取输出数据
+output_tensor = predictor.get_output(0)
+print(output_tensor.shape())
+print(output_tensor.float_data()[:10])
+```
+
+### `resize(shape)`
+
+设置Tensor的维度信息。
+
+参数：
+
+- `shape(list)` - 维度信息
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `shape()`
+
+获取Tensor的维度信息。
+
+参数：
+
+- `None`
+
+返回：Tensor的维度信息
+
+返回类型：`list`
+
+
+
+### `float_data()`
+
+获取Tensor的持有的float型数据。
+
+示例：
+
+```python
+output_tensor = predictor.get_output(0)
+print(output_tensor.shape())
+print(output_tensor.float_data()[:10])
+```
+
+参数：
+
+- `None`
+
+返回：`Tensor`持有的float型数据
+
+返回类型：`list`
+
+
+
+### `set_float_data(float_data)`
+
+设置Tensor持有float数据。
+
+示例：
+
+```python
+input_tensor = predictor.get_input(0)
+input_tensor.resize([1, 3, 224, 224])
+input_tensor.set_float_data([1.] * 3 * 224 * 224)
+```
+
+参数：
+
+- `float_data(list)` - 待设置的float型数据
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `set_lod(lod)`
+
+设置Tensor的LoD信息。
+
+参数：
+
+- `lod(list[list])` - Tensor的LoD信息
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `lod()`
+
+获取Tensor的LoD信息
+
+参数：
+
+- `None`
+
+返回：`Tensor`的LoD信息
+
+返回类型：`list[list]`
--- a/docs/api_reference/python_api/TypePlace.md
+++ b/docs/api_reference/python_api/TypePlace.md
+## TargetType
+
+```python
+class TargetType;
+```
+`TargetType`为目标设备硬件类型，用户可以根据应用场景选择硬件平台类型。
+
+枚举型变量`TargetType`的所有可能取值包括：
+
+`{X86, CUDA, ARM, OpenCL, FPGA, NPU}`
+
+
+## PrecisionType
+```python
+class PrecisionType {FP32};
+```
+`PrecisionType`为模型中Tensor的数据精度，默认值为FP32(float32)。
+
+枚举型变量`PrecisionType`的所有可能取值包括：
+
+`{FP32, INT8, INT32, INT64}`
+
+
+
+
+## DataLayoutType
+
+```python
+class DataLayoutType {NCHW};
+```
+`DataLayoutType`为Tensor的数据格式，默认值为NCHW（number, channel, height, weigth）。
+
+枚举型变量`DataLayoutType`的所有可能取值包括：
+
+` {NCHW, NHWC}`
+
+
+
+## Place
+```python
+class Place{
+  TargetType target;
+  PrecisionType precision{FP32};
+  DataLayoutType layout{NCHW}
+}
+```
+`Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合，说明运行时的设备类型、数据精度和数据格式。
+
+示例：
+```python
+from lite_core import *
+
+Place{TargetType(ARM), PrecisionType(FP32), DataLayoutType(NCHW)}
+```
--- a/docs/api_reference/python_api/create_paddle_predictor.md
+++ b/docs/api_reference/python_api/create_paddle_predictor.md
+
+## create_paddle_predictor
+
+```python
+CxxPredictor create_paddle_predictor(config); # config为CxxConfig类型
+LightPredictor create_paddle_predictor(config); # config为MobileConfig类型
+```
+
+`create_paddle_predictor`函数用来根据`CxxConfig`或`MobileConfig`构建预测器。
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+# 设置CxxConfig
+config = CxxConfig()
+config.set_model_dir(<your_model_dir_path>)
+places = [Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 根据CxxConfig创建CxxPredictor
+predictor = create_paddle_predictor(config)
+```
+
+参数：
+
+- `config(CxxConfig或MobileConfig)` - 用于构建Predictor的配置信息。
+
+返回：预测器`predictor`
+
+返回类型：`CxxPredictor`或`LightPredictor`
--- a/docs/api_reference/python_api/opt.md
+++ b/docs/api_reference/python_api/opt.md
+## Opt
+
+```python
+class Opt;
+```
+
+`Opt`模型离线优化接口，Paddle原生模型需经`opt`优化图结构后才能在Paddle-Lite上运行。
+
+示例：  
+
+假设待转化模型问当前文件夹下的`mobilenet_v1`，可以使用以下脚本转换
+
+```python
+# 引用Paddlelite预测库
+from paddlelite.lite import *
+
+# 1. 创建opt实例
+opt=Opt()
+# 2. 指定输入模型地址 
+opt.set_model_dir("./mobilenet_v1")
+# 3. 指定转化类型： arm、x86、opencl、xpu、npu
+opt.set_valid_places("arm")
+# 4. 指定模型转化类型： naive_buffer、protobuf
+opt.set_model_type("naive_buffer")
+# 4. 输出模型地址
+opt.set_optimize_out("mobilenetv1_opt")
+# 5. 执行模型优化
+opt.run()
+```
+
+### `set_model_dir(model_dir)`
+
+设置模型文件夹路径，当需要从磁盘加载非combined模型时使用。
+
+参数：
+
+- `model_dir(str)` - 模型文件夹路径
+
+返回：`None`
+
+
+
+### `set_model_file(model_file)`
+
+设置模型文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `model_file(str)` - 模型文件路径
+
+
+
+### `set_param_file(param_file)`
+
+设置模型参数文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `param_file(str)` - 模型文件路径
+
+
+### `set_model_type(type)`
+
+设置模型的输出类型，当前支持`naive_buffer`和`protobuf`两种格式，移动端预测需要转化为`naive_buffer`
+
+参数：
+
+- `type(str)` - 模型格式（`naive_buffer/protobuf`)
+
+
+
+### `set_valid_places(valid_places)`
+
+设置可用的places列表。
+
+参数：
+
+- `valid_places(str)` - 可用place列表，不同place用`,`隔开
+
+示例：
+
+```python
+# 引用Paddlelite预测库
+from paddlelite.lite import *
+
+# 1. 创建opt实例
+opt=Opt()
+# 2. 指定转化类型： arm、x86、opencl、xpu、npu
+opt.set_valid_places("arm, opencl")
+```
+
+
+
+
+### `set_optimize_out(optimized_model_name)`
+
+设置优化后模型的名称，优化后模型文件以`.nb`作为文件后缀。
+
+参数：
+
+- `optimized_model_name(str)`
+
+### `run()`
+
+执行模型优化，用以上接口设置完 `模型路径`、`model_type`、`optimize_out`和`valid_places`后，执行`run()`接口会根据以上设置转化模型，转化后模型保存在当前路径下。
+
+
+### `run_optimize(model_dir, model_file, param_file, type, valid_places, optimized_model_name)`
+
+执行模型优化，无需设置以上接口，直接指定 `模型路径`、`model_type`、`optimize_out`和`valid_places`并执行模型转化。
+
+参数：
+
+- `model_dir(str)` - 模型文件夹路径
+- `model_file(str)` - 模型文件路径
+- `param_file(str)` - 模型文件路径
+- `type(str)` - 模型格式（`naive_buffer/protobuf`)
+- `valid_places(str)` - 可用place列表，不同place用`,`隔开
+- `optimized_model_name(str)`
+
+```python
+# 引用Paddlelite预测库
+from paddlelite.lite import *
+# 1. 创建opt实例
+opt=Opt()
+# 2. 执行模型优化
+opt.run_optimize("./mobilenet_v1","","","arm","mobilenetv1_opt");
+```
--- a/docs/api_reference/python_api_doc.md
+++ b/docs/api_reference/python_api_doc.md
+# Python API
+
+
+### [create_paddle_predictor](./python_api/create_paddle_predictor)
+
+创建预测执行器[`CxxPredictor`](./python_api/CxxPredictor)或者[`LightPredictor`](./python_api/LightPredictor)
+
+### [Opt](./python_api/opt)
+
+```python
+class Opt;
+```
+
+`Opt`模型离线优化接口，Paddle原生模型需经`opt`优化图结构后才能在Paddle-Lite上运行。
+
+### [CxxConfig](./python_api/CxxConfig)
+```python
+class CxxConfig;
+```
+
+`CxxConfig`用来配置构建CxxPredictor的配置信息，如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。
+
+
+### [MobileConfig](./python_api/MobileConfig)
+
+```python
+class MobileConfig;
+```
+
+`MobileConfig`用来配置构建LightPredictor的配置信息，如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。
+
+
+### [CxxPredictor](./python_api/CxxPredictor)
+
+```python
+class CxxPredictor
+```
+
+`CxxPredictor`是Paddle-Lite的预测器，由`create_paddle_predictor`根据`CxxConfig`进行创建。用户可以根据CxxPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。
+
+
+
+### [TargetType 、PrecisionType、DataLayoutType、Place](./python_api/TypePlace)
+
+`TargetType`为目标设备硬件类型，用户可以根据应用场景选择硬件平台类型。
+
+`PrecisionType`为模型中Tensor的数据精度，默认值为FP32(float32)。
+
+`DataLayoutType`为Tensor的数据格式，默认值为NCHW（number, channel, height, weigth）。
+
+`Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合，说明运行时的设备类型、数据精度和数据格式。
+
+
+
+
+### [PowerMode](./python_api/PowerMode)
+
+```python
+class PowerMode;
+```
+
+`PowerMode`为ARM CPU能耗模式，用户可以根据应用场景设置能耗模式获得最优的能效比。
+
+
+
+### [Tensor](./python_api/Tensor)
+
+```c++
+class Tensor
+```
+
+Tensor是Paddle-Lite的数据组织形式，用于对底层数据进行封装并提供接口对数据进行操作，包括设置Shape、数据、LoD信息等。
+
+*注意：用户应使用`CxxPredictor`或`LightPredictor`的`get_input`和`get_output`接口获取输入/输出的`Tensor`。*
--- a/docs/benchmark/benchmark.md
+++ b/docs/benchmark/benchmark.md
-# Benchmark 数据
+# 性能数据

 可以参考[benchmark_tools](benchmark_tools)，推荐**一键benchmark**。

@@ -15,14 +15,12 @@
    * int8模型
        * mobilenet_v1
        * mobilenet_v2
-        * resnet50

 * 测试机器(android ndk ndk-r17c)
   *  骁龙855
      * xiaomi mi9, snapdragon 855 
      * 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz

-
   *  骁龙845
      * xiaomi mi8, 845
      * 2.8GHz（大四核），1.7GHz（小四核）
@@ -30,20 +28,12 @@
   *  骁龙835
      * xiaomi mix2, snapdragon 835
      * 2.45GHz（大四核），1.9GHz（小四核）
- 
-   *  骁龙625
-      * oppo R9s, snapdragon625
-      * A53 x 8, big core@2.0GHz
- 
-   * 骁龙653
-      * 360 N5, snapdragon 653
-      * 4 x A73@2.0GHz + 4 x A53@1.4GHz
- 
+
   * 麒麟970
      * HUAWEI Mate10
 
 * 测试说明
-    * branch: release/2.0.0
+    * branch: release/v2.3.0
    * warmup=10, repeats=30，统计平均时间，单位是ms
    * 当线程数为1时，```DeviceInfo::Global().SetRunMode```设置LITE_POWER_HIGH，否者设置LITE_POWER_NO_BIND
    * 模型的输入图像的维度是{1, 3, 224, 224}，输入图像的每一位数值是1
@@ -55,78 +45,59 @@

 #### paddlepaddle model

-
 骁龙855|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 
-mobilenet_v1 |32.19 |18.81 |10.90 |30.92 |18.31 |10.15 
-mobilenet_v2 |22.91 |13.75 |8.64 |21.15 |12.79 |7.84 
-shufflenet_v2 |4.67 |3.37 |2.65 |4.43 |3.15 |2.66 
-squeezenet_v1.1 |25.10 |15.93 |9.68 |23.28 |14.61 |8.71 
-mnasnet |21.84 |13.14 |7.96 |19.61 |11.88 |7.55
+mobilenet_v1 |33.27 |19.52 |11.14 |31.72 |18.76 |10.24 |
+mobilenet_v2 |29.08 |15.79 |9.25 |25.89 |14.17 |8.38 |
+shufflenet_v2 |4.40 |3.09 |2.30 |4.28 |3.02 |2.35 |
+squeezenet_v1.1 |19.96 |12.61 |8.76 |18.25 |11.46 |7.97 |
+mnasnet |21.00 |12.54 |7.28 |19.65 |11.65 |6.96 |


-
-骁龙835|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
+骁龙845|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 
-mobilenet_v1 |94.13 |52.17 |30.68 |88.28 |47.58 |26.64 
-mobilenet_v2 |61.24 |34.64 |22.36 |56.66 |32.19 |19.63 
-shufflenet_v2 |10.87 |6.92 |5.12 |10.41 |6.76 |4.97 
-squeezenet_v1.1 |73.61 |42.25 |24.44 |64.87 |38.43 |23.06 
-mnasnet |58.22 |33.43 |20.44 |53.43 |30.20 |18.09 
-
+mobilenet_v1 |66.36 |35.97 |19.45 |62.66 |33.87 |17.85 |
+mobilenet_v2 |45.86 |25.53 |14.6 |41.58 |23.24 |13.39 |
+shufflenet_v2 |7.58 |4.89 |3.41 |7.44 |4.91 |3.58 |
+squeezenet_v1.1 |37.15 |22.74 |13.51 |34.69 |21.27 |12.74 |
+mnasnet |40.09 |21.73 |11.91 |38.19 |21.02 |12.11 |

-麒麟980|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
----| ---- | ---- | ---- | ----  |----  |----
-threads num|1 |2 |4 |1 |2 |4 
-mobilenet_v1 |55.11 |28.24 |13.27 |34.24 |17.74 |12.41 
-mobilenet_v2 |37.03 |19.80 |51.94 |23.64 |12.98 |9.38 
-shufflenet_v2 |7.26 |4.94 |15.06 |5.32 |3.33 |2.82 
-squeezenet_v1.1 |42.73 |23.66 |57.39 |26.03 |14.53 |13.66 
-mnasnet |36.87 |20.15 |46.04 |21.85 |12.06 |8.68 

-麒麟970|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
+骁龙835|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 
-mobilenet_v1 |97.80 |52.64 |34.46 |94.51 |49.36 |28.43 
-mobilenet_v2 |66.55 |38.52 |23.19 |62.89 |34.93 |21.53 
-shufflenet_v2 |13.78 |8.11 |5.93 |11.95 |7.90 |5.91 
-squeezenet_v1.1 |77.64 |43.67 |25.72 |69.91 |40.66 |24.62 
-mnasnet |61.86 |34.62 |22.68 |59.61 |32.79 |19.56 
+mobilenet_v1 |96.98 |53.92 |32.24 |89.31 |48.02 |27.58 |
+mobilenet_v2 |67.72 |37.66 |23.82 |60.10 |34.36 |21.05 |
+shufflenet_v2 |10.72 |6.62 |4.63 |10.10 |6.44 |4.63 |
+squeezenet_v1.1 |53.89 |33.28 |20.73 |50.83 |32.31 |19.51 |
+mnasnet |59.55 |33.53 |20.32 |56.21 |31.58 |19.06 |

 #### caffe model

 骁龙855|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |32.42 |18.68 |10.86 |30.92 |18.35 |10.07 |
-mobilenet_v2 |29.53 |17.76 |10.89 |27.19 |16.53 |9.75 |
-shufflenet_v2 |4.61 |3.29 |2.61 |4.36 |3.11 |2.51 |
-
-
-骁龙835|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
----| ---- | ---- | ---- | ----  |----  |----
-threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |92.52 |52.34 |30.37 |88.31 |49.75 |27.29 |
-mobilenet_v2 |79.50 |45.67 |28.79 |76.13 |44.01 |26.13 |
-shufflenet_v2 |10.94 |7.08 |5.16 |10.64 |6.83 |5.01 |
+mobilenet_v1 |33.36 |19.45 |11.26 |31.63 |18.74 |10.31 |
+mobilenet_v2 |31.63 |19.21 |11.61 |28.34 |17.14 |10.16 |
+shufflenet_v2 |4.46 |3.08 |2.32 |4.26 |2.98 |2.35 |


-麒麟980|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
+骁龙845|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |55.36 |28.18 |13.31 |34.42 |17.93 |12.52 |
-mobilenet_v2 |49.17 |26.10 |65.49 |30.50 |16.66 |11.72 |
-shufflenet_v2 |8.45 |5.00 |15.65 |4.58 |3.14 |2.83 |
+mobilenet_v1 |66.32 |35.83 |19.56 |62.52 |33.79 |17.91 |
+mobilenet_v2 |58.46 |32.69 |18.56 |53.72 |29.86 |16.80 |
+shufflenet_v2 |7.65 |4.82 |3.46 |7.55 |4.97 |3.62 |


-麒麟970|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
+骁龙835|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |97.85 |53.38 |33.85 |94.29 |49.42 |28.29 |
-mobilenet_v2 |87.40 |50.25 |31.85 |85.55 |48.11 |28.24 |
-shufflenet_v2 |12.16 |8.39 |6.21 |12.21 |8.33 |6.32 |
+mobilenet_v1 |95.38 |54.09 |32.03 |95.05 |48.33 |27.54 |
+mobilenet_v2 |88.46 |48.98 |30.23 |79.28 |44.64 |27.10 |
+shufflenet_v2 |10.07 |6.51 |4.61 |10.31 |6.50 |4.66 |

 #### int8量化模型测试数据

@@ -136,6 +107,7 @@ threads num|1 |2 |4 |1 |2 |4 |
 mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 |
 mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 |

+
 骁龙835|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |

--- a/docs/benchmark/benchmark_tools.md
+++ b/docs/benchmark/benchmark_tools.md
-# Benchmark 测试方法
+# 测试方法

 本文将会介绍，在**Ubuntu:16.04交叉编译环境**下，用安卓手机在终端测试Paddle-Lite的性能，并介绍两种Benchmark方法：

@@ -28,63 +28,64 @@ List of devices attached
 执行以下命令，完成Benchmark：

 ```shell
-wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/run_benchmark.sh
+# Test v2.6 branch
+wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_2.6/run_benchmark.sh
+sh run_benchmark.sh
+
+# Test v2.3 branch
+wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_2.3/run_benchmark.sh
 sh run_benchmark.sh
 ```

 该`run_benchmark.sh`脚本会：

-1. 下载模型，并上传手机：包含mobilenetv1/v2、shufflenetv2、squeezenetv1.1、mnasnet；
+1. 下载模型，并上传手机：包含mobilenetv1、mobilenetv2、shufflenetv2、squeezenetv1.1、mnasnet、mobilenetv1_int8、mobilenetv2_int8；
 2. 下载pre-built android-armv7和android-armv8的可执行文件，并上传手机：`benchmark_bin_v7`和`benchmark_bin_v8`；
 3. 自动执行另一个脚本`benchmark.sh`（多台手机连接USB，请在`benchmark.sh`脚本中对`adb`命令后加上测试手机的`serial number`）；
 4. 从手机下载benchmark结果`result_armv7.txt`和`result_armv8.txt`，到当前目录，并显示Benchmark结果。

 ## 二. 逐步Benchmark

-### 1. 获取benchmark可执行文件
-
-benchmark_bin文件可以测试PaddleLite的性能，有下面两种方式获得。
-
-#### 方式一：下载benchmark_bin可执行文件
-
-```shell
-# Download benchmark_bin for android-armv7
-wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v7
-
-# Download benchmark_bin for android-armv8
-wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v8
-```
-
-#### 方式二：由源码编译benchmark_bin文件
+### 1. 编译benchmark可执行文件

-根据[源码编译](../source_compile)准备编译环境，拉取PaddleLite最新release发布版代码，并在仓库根目录下，执行：
+根据[源码编译](../user_guides/source_compile)准备编译环境，拉取PaddleLite最新特定分支代码，并在仓库根目录下，执行：

 ```shell
 ###########################################
 # Build benchmark_bin for android-armv7   #
 ###########################################
-./lite/tools/ci_build.sh  \
-  --arm_os="android" \
-  --arm_abi="armv7" \
-  --arm_lang="gcc " \
-  build_arm
+
+./lite/tools/build.sh \
+  --arm_os=android \
+  --arm_abi=armv7 \
+  --arm_lang=gcc \
+  --android_stl=c++_static \
+  --build_extra=ON \
+  --with_log=OFF \
+  full_publish

 # `benchmark_bin` 在: <paddle-lite-repo>/build.lite.android.armv7.gcc/lite/api/benchmark_bin

 ###########################################
 # Build benchmark_bin for android-armv8   #
 ###########################################
-./lite/tools/ci_build.sh  \
-  --arm_os="android" \
-  --arm_abi="armv8" \
-  --arm_lang="gcc "  \
-  build_arm
+
+./lite/tools/build.sh \
+  --arm_os=android \
+  --arm_abi=armv8 \
+  --arm_lang=gcc \
+  --android_stl=c++_static \
+  --build_extra=ON \
+  --with_log=OFF \
+  full_publish

 # `benchmark_bin` 在: <paddle-lite-repo>/build.lite.android.armv8.gcc/lite/api/benchmark_bin
 ```

 > **注意**：为了避免在docker内部访问不到手机的问题，建议编译得到benchmark_bin后退出到docker外面，并且将benchmark_bin文件拷贝到一个临时目录。然后在该临时目录下，按照下面步骤下载模型、拷贝脚本、测试。

+> **注意**：如果不是测试常见分类模型（单输入，输入shape是1x3x224x224），需要根据实际情况修改`/PaddleLite/lite/api/benchmark.cc`文件，然后编译得到可执行文件。
+
 ### 2. 准备模型

 PaddleLite为Benchmark准备好了[常见Benchmark模型](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_models.tgz)。
@@ -135,53 +136,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true
 > 不同手机，不同版本，测试模型的性能数据不同。

 ```shell
-run benchmark armv7
+run benchmark armv8
 --------------------------------------
 PaddleLite Benchmark
 Threads=1 Warmup=10 Repeats=30
-- mnasnet               avg = 159.8427 ms
-- mobilenet_v1          avg = 235.0072 ms
-- mobilenet_v2          avg = 173.0387 ms
-- shufflenet_v2         avg = 76.0040 ms
-- squeezenet_v11        avg = 164.2957 ms
+mnasnet                       min = 19.83500    max = 19.38500    average = 19.65503
+mobilenetv1                   min = 32.00600    max = 31.56900    average = 31.81983
+mobilenetv2                   min = 22.37900    max = 22.08700    average = 22.28623
+shufflenetv2                  min = 10.80400    max = 10.62900    average = 10.68890
+squeezenet                    min = 17.67400    max = 17.47900    average = 17.57677

 Threads=2 Warmup=10 Repeats=30
-- mnasnet               avg = 83.1287 ms
-- mobilenet_v1          avg = 121.6029 ms
-- mobilenet_v2          avg = 86.6175 ms
-- shufflenet_v2         avg = 41.5761 ms
-- squeezenet_v11        avg = 87.8678 ms
+mnasnet                       min = 11.85600    max = 11.72000    average = 11.77127
+mobilenetv1                   min = 18.75000    max = 18.64300    average = 18.70593
+mobilenetv2                   min = 14.05100    max = 13.59900    average = 13.71450
+shufflenetv2                  min = 6.67200     max = 6.58300     average = 6.63400
+squeezenet                    min = 12.07100    max = 11.33400    average = 11.41253

 Threads=4 Warmup=10 Repeats=30
-- mnasnet               avg = 73.3880 ms
-- mobilenet_v1          avg = 119.0739 ms
-- mobilenet_v2          avg = 85.3050 ms
-- shufflenet_v2         avg = 38.0762 ms
-- squeezenet_v11        avg = 64.2201 ms
+mnasnet                       min = 7.19300     max = 7.02600     average = 7.08480
+mobilenetv1                   min = 10.42000    max = 10.29100    average = 10.34267
+mobilenetv2                   min = 8.61900     max = 8.46900     average = 8.54707
+shufflenetv2                  min = 4.55200     max = 4.41900     average = 4.46477
+squeezenet                    min = 8.60000     max = 7.85200     average = 7.98407
 --------------------------------------

-run benchmark armv8
+run benchmark armv7
 --------------------------------------
 PaddleLite Benchmark
 Threads=1 Warmup=10 Repeats=30
-- mnasnet               avg = 165.3073 ms
-- mobilenet_v1          avg = 306.0188 ms
-- mobilenet_v2          avg = 195.1884 ms
-- shufflenet_v2         avg = 99.3692 ms
-- squeezenet_v11        avg = 156.6971 ms
+mnasnet                       min = 20.98300    max = 20.81400    average = 20.92527
+mobilenetv1                   min = 33.19000    max = 32.81700    average = 33.08490
+mobilenetv2                   min = 25.91400    max = 25.61700    average = 25.73097
+shufflenetv2                  min = 11.14300    max = 10.97600    average = 11.06757
+squeezenet                    min = 19.31800    max = 19.20000    average = 19.26530

 Threads=2 Warmup=10 Repeats=30
-- mnasnet               avg = 90.2290 ms
-- mobilenet_v1          avg = 157.0007 ms
-- mobilenet_v2          avg = 118.1607 ms
-- shufflenet_v2         avg = 68.6804 ms
-- squeezenet_v11        avg = 91.3090 ms
+mnasnet                       min = 12.59900    max = 12.46600    average = 12.52207
+mobilenetv1                   min = 19.05800    max = 18.94700    average = 18.97897
+mobilenetv2                   min = 15.28400    max = 15.11300    average = 15.19843
+shufflenetv2                  min = 6.97000     max = 6.81400     average = 6.90863
+squeezenet                    min = 12.87900    max = 12.12900    average = 12.22530

 Threads=4 Warmup=10 Repeats=30
-- mnasnet               avg = 179.9730 ms
-- mobilenet_v1          avg = 204.0684 ms
-- mobilenet_v2          avg = 181.6486 ms
-- shufflenet_v2         avg = 123.2728 ms
-- squeezenet_v11        avg = 412.9046 ms
+mnasnet                       min = 7.31400     max = 7.12900     average = 7.20357
+mobilenetv1                   min = 11.44000    max = 10.86900    average = 10.94383
+mobilenetv2                   min = 9.14900     max = 9.03800     average = 9.09907
+shufflenetv2                  min = 4.60600     max = 4.49400     average = 4.53360
+squeezenet                    min = 8.27000     max = 8.10600     average = 8.19000
 --------------------------------------
 ```
--- a/docs/benchmark/index.rst
+++ b/docs/benchmark/index.rst
--- a/docs/demo_guides/android_app_demo.md
+++ b/docs/demo_guides/android_app_demo.md
+# Android Demo
+
+## 多种应用场景
+
+我们提供的Paddle-Lite示例工程[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)，其中包含[Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo)、[iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)和[Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo)平台的示例工程。涵盖[人脸识别](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/face_detection_demo)、[人像分割](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/human_segmentation_demo)、[图像分类](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo)、[目标检测](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo)4个应用场景。
+
+### 1. 人脸识别
+
+人脸检测是Paddle-Lite提供的人像检测demo。在移动端上提供了高精度、实时的人脸检测能力，能处理基于人脸检测的业务场景。在移动端预测的效果图如下：
+
+<p align="center"><img width="300" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/face.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="300" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/face2.jpg"/></p>
+
+### 2. 人像分割
+
+人像分割是Paddle-Lite 提供的图像分割demo ，在移动端上提供了实时的人像分割能力，可以应用证件照自动抠图、面积测量、智能交通（标记车道和交通标志）等场景。  在移动端预测的效果图如下：
+
+<p align="center"><img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/human.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/human2.jpg"/></p>
+
+### 3. 图像分类
+
+图像分类是Paddle-Lite 提供的图像处理demo ，在移动端上提供了实时的物体识别能力，可以应用到生产线自动分拣或质检、识别医疗图像、辅助医生肉眼诊断等场景。在移动端预测的效果图如下：
+
+<p align="center"><img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat2.jpg"/></p>
+
+### 4. 物体检测
+
+物体检测是Paddle-Lite 提供的图像识别demo ，在移动端上提供了检测多个物体的位置、名称、位置及数量的能力。可以应用到视频监控（是否有违规物体或行为）、工业质检（微小瑕疵的数量和位置）、医疗诊断（细胞计数、中药识别）等场景。在移动端预测的效果图如下：
+
+<p align="center"><img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog2.jpg"/></p>
+
+## Android demo部署方法
+
+下面我们以 **目标检测示例（object_detection_demo)** 为例讲解如何部署。
+
+**目的**：将基于Paddle-Lite预测库的Android APP 部署到手机，实现物体检测
+
+**需要的环境**： Android Studio、Android手机（开启USB调试模式）、下载到本地的[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)工程
+
+**部署步骤**：
+
+1、 目标检测的Android示例位于 `Paddle-Lite-Demo\PaddleLite-android-demo\object_detection_demo`
+
+2、用Android Studio 打开object_detection_demo工程 （本步骤需要联网）。
+
+3、手机连接电脑，打开**USB调试**和**文件传输模式**，在Android Studio上连接自己的手机设备（手机需要开启允许从 USB安装软件权限）
+
+![Android_studio](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_studio.png)
+
+4、按下 Run按钮，自动编译APP并安装到手机。(该过程会自动下载Paddle-Lite预测库和模型，需要联网)
+
+成功后效果如下，图一：APP安装到手机        图二： APP打开后的效果，会自动识别图片中的物体并标记
+
+<p align="center"><img width="300" height="450"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp0.png"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="300" height="450"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp1.jpg"/></p>
+
+## Android demo结构讲解
+
+Android 示例的代码结构如下图所示：
+
+<p align="center"><img width="600" height="450"  src="http://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_struct.png"/>
+
+
+   1、 Predictor.java： 预测代码
+
+```shell
+# 位置：
+object_detection_demo/app/src/main/java/com/baidu/paddle/lite/demo/object_detection/Predictor.java
+```
+
+  2、 model.nb : 模型文件 (opt 工具转化后Paddle-Lite模型)；pascalvoc_label_list：训练模型时的`labels`文件
+
+```shell
+# 位置：
+object_detection_demo/app/src/main/assets/models/ssd_mobilenet_v1_pascalvoc_for_cpu/model.nb
+object_detection_demo/app/src/main/assets/labels/pascalvoc_label_list
+```
+
+  3、 libpaddle_lite_jni.so、PaddlePredictor.jar：Paddle-Lite Java 预测库与Jar包 
+
+```shell
+# 位置
+object_detection_demo/app/src/main/jniLibs/arm64-v8a/libpaddle_lite_jni.so
+object_detection_demo/app/libs/PaddlePredictor.jar
+```
+
+  4、 build.gradle : 定义编译过程的 gradle 脚本。（不用改动，定义了自动下载Paddle-Lite预测和模型的过程）
+
+```shell
+# 位置
+object_detection_demo/app/build.gradle
+```
+
+
+
+## 代码讲解 （使用Paddle-Lite Java API 执行预测）
+
+Android 示例基于Java API 开发，调用Paddle-Lite Java API包括以下五步。更详细的API 描述参考： [Paddle-Lite Java API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html)。
+
+```c++
+// 导入Java API
+import com.baidu.paddle.lite.MobileConfig;
+import com.baidu.paddle.lite.Tensor;
+import com.baidu.paddle.lite.Predictor;
+import com.baidu.paddle.lite.PowerMode;
+
+// 1. 写入配置：设置MobileConfig
+MobileConfig config = new MobileConfig();
+config.setModelFromFile(<modelPath>); // 设置Paddle-Lite模型路径
+config.setPowerMode(PowerMode.LITE_POWER_NO_BIND); // 设置CPU运行模式
+config.setThreads(4); // 设置工作线程数
+
+// 2. 创建 PaddlePredictor
+PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
+
+// 3. 设置输入数据
+long[] dims = {100, 100};
+float[] inputBuffer = new float[10000];
+for (int i = 0; i < 10000; ++i) {
+    inputBuffer[i] = i;
+}
+Tensor input = predictor.getInput(0);
+input.resize(dims);
+input.setData(inputBuffer);
+
+// 4. 执行预测
+predictor.run();
+
+// 5. 获取输出数据
+Tensor result = predictor.getOutput(0);
+float[] output = result.getFloatData();
+for (int i = 0; i < 1000; ++i) {
+    System.out.println(output[i]);
+}
+```
--- a/docs/demo_guides/baidu_xpu.md
+++ b/docs/demo_guides/baidu_xpu.md
+# PaddleLite使用百度XPU预测部署
+
+Paddle Lite已支持百度XPU在x86和arm服务器（例如飞腾 FT-2000+/64）上进行预测部署。
+目前支持Kernel和子图两种接入方式，其中子图接入方式与之前华为NPU类似，即加载并分析Paddle模型，将Paddle算子转成XTCL组网API进行网络构建，在线生成并执行模型。
+
+## 支持现状
+
+### 已支持的芯片
+
+- 昆仑818-100（推理芯片）
+- 昆仑818-300（训练芯片）
+
+### 已支持的设备
+
+- K100/K200昆仑AI加速卡
+
+### 已支持的Paddle模型
+
+- [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz)
+- [BERT](https://paddlelite-demo.bj.bcebos.com/models/bert_fp32_fluid.tar.gz)
+- [ERNIE](https://paddlelite-demo.bj.bcebos.com/models/ernie_fp32_fluid.tar.gz)
+- YOLOv3
+- Mask R-CNN
+- Faster R-CNN
+- UNet
+- SENet
+- SSD
+- 百度内部业务模型（由于涉密，不方便透露具体细节）
+
+### 已支持（或部分支持）的Paddle算子（Kernel接入方式）
+
+- scale
+- relu
+- tanh
+- sigmoid
+- stack
+- matmul
+- pool2d
+- slice
+- lookup_table
+- elementwise_add
+- elementwise_sub
+- cast
+- batch_norm
+- mul
+- layer_norm
+- softmax
+- conv2d
+- io_copy
+- io_copy_once
+- __xpu__fc
+- __xpu__multi_encoder
+- __xpu__resnet50
+- __xpu__embedding_with_eltwise_add
+
+### 已支持（或部分支持）的Paddle算子（子图/XTCL接入方式）
+
+- relu
+- tanh
+- conv2d
+- depthwise_conv2d
+- elementwise_add
+- pool2d
+- softmax
+- mul
+- batch_norm
+- stack
+- gather
+- scale
+- lookup_table
+- slice
+- transpose
+- transpose2
+- reshape
+- reshape2
+- layer_norm
+- gelu
+- dropout
+- matmul
+- cast
+- yolo_box
+
+
+## 参考示例演示
+
+### 测试设备(K100昆仑AI加速卡)
+
+![baidu_xpu](https://paddlelite-demo.bj.bcebos.com/devices/baidu/baidu_xpu.jpg)
+
+### 准备设备环境
+
+- K100/200昆仑AI加速卡[规格说明书](https://paddlelite-demo.bj.bcebos.com/devices/baidu/K100_K200_spec.pdf)，如需更详细的规格说明书或购买产品，请联系欧阳剑ouyangjian@baidu.com；
+- K100为全长半高PCI-E卡，K200为全长全高PCI-E卡，要求使用PCI-E x16插槽，且需要单独的8针供电线进行供电；
+- 安装K100/K200驱动，目前支持Ubuntu和CentOS系统，由于驱动依赖Linux kernel版本，请正确安装对应版本的驱动安装包。
+
+### 准备本地编译环境
+
+- 为了保证编译环境一致，建议参考[源码编译](../user_guides/source_compile)中的Linux开发环境进行配置；
+- 由于编译示例程序需要依赖OpenCV和CMake 3.10.3，请执行如下命令进行安装；
+
+```shell
+$ sudo apt-get update
+$ sudo apt-get install gcc g++ make wget unzip libopencv-dev pkg-config
+$ wget https://www.cmake.org/files/v3.10/cmake-3.10.3.tar.gz
+$ tar -zxvf cmake-3.10.3.tar.gz
+$ cd cmake-3.10.3
+$ ./configure
+$ make
+$ sudo make install
+```
+
+### 运行图像分类示例程序
+
+- 从[https://paddlelite-demo.bj.bcebos.com/devices/baidu/PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/baidu/PaddleLite-linux-demo.tar.gz)下载示例程序，解压后清单如下：
+
+```shell
+- PaddleLite-linux-demo
+  - image_classification_demo
+    - assets
+      - images 
+        - tabby_cat.jpg # 测试图片
+      - labels
+        - synset_words.txt # 1000分类label文件
+      - models
+        - resnet50_fp32_224_fluid # Paddle fluid non-combined格式的resnet50 float32模型
+          - __model__ # Paddle fluid模型组网文件，可拖入https://lutzroeder.github.io/netron/进行可视化显示网络结构
+          - bn2a_branch1_mean # Paddle fluid模型参数文件
+          - bn2a_branch1_scale
+          ...
+    - shell
+      - CMakeLists.txt # 示例程序CMake脚本
+      - build
+        - image_classification_demo # 已编译好的，适用于amd64的示例程序
+      - image_classification_demo.cc # 示例程序源码
+      - build.sh # 示例程序编译脚本
+      - run.sh # 示例程序运行脚本
+  - libs
+    - PaddleLite
+      - amd64
+        - include # PaddleLite头文件
+        - lib
+          - libiomp5.so # Intel OpenMP库
+          - libmklml_intel.so # Intel MKL库
+          - libxpuapi.so # XPU API库，提供设备管理和算子实现。
+          - llibxpurt.so # XPU runtime库
+          - libpaddle_full_api_shared.so # 预编译PaddleLite full api库
+      - arm64
+        - include # PaddleLite头文件
+        - lib
+          - libxpuapi.so # XPU API库，提供设备管理和算子实现。
+          - llibxpurt.so # XPU runtime库
+          - libpaddle_full_api_shared.so # 预编译PaddleLite full api库
+```
+
+- 进入PaddleLite-linux-demo/image_classification_demo/shell，直接执行./run.sh amd64即可；
+
+```shell
+$ cd PaddleLite-linux-demo/image_classification_demo/shell
+$ ./run.sh amd64 # 默认已生成amd64版本的build/image_classification_demo，因此，无需重新编译示例程序就可以执行。
+$ ./run.sh arm64 # 需要在arm64(FT-2000+/64)服务器上执行./build.sh arm64后才能执行该命令。
+...
+AUTOTUNE:(12758016, 16, 1, 2048, 7, 7, 512, 1, 1, 1, 1, 0, 0, 0) = 1by1_bsp(1, 32, 128, 128)
+Find Best Result in 150 choices, avg-conv-op-time = 40 us
+[INFO][XPUAPI][/home/qa_work/xpu_workspace/xpu_build_dailyjob/api_root/baidu/xpu/api/src/wrapper/conv.cpp:274] Start Tuning: (12758016, 16, 1, 512, 7, 7, 512, 3, 3, 1, 1, 1, 1, 0)
+AUTOTUNE:(12758016, 16, 1, 512, 7, 7, 512, 3, 3, 1, 1, 1, 1, 0) = wpinned_bsp(1, 171, 16, 128)
+Find Best Result in 144 choices, avg-conv-op-time = 79 us
+I0502 22:34:18.176113 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
+I0502 22:34:18.176406 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
+I0502 22:34:18.176697 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
+iter 0 cost: 2.116000 ms
+I0502 22:34:18.178530 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
+I0502 22:34:18.178792 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
+iter 1 cost: 2.101000 ms
+I0502 22:34:18.180634 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
+I0502 22:34:18.180881 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
+iter 2 cost: 2.089000 ms
+I0502 22:34:18.182726 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
+I0502 22:34:18.182976 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
+iter 3 cost: 2.085000 ms
+I0502 22:34:18.184814 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
+I0502 22:34:18.185068 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
+iter 4 cost: 2.101000 ms
+warmup: 1 repeat: 5, average: 2.098400 ms, max: 2.116000 ms, min: 2.085000 ms
+results: 3
+Top0  tabby, tabby cat - 0.689418
+Top1  tiger cat - 0.190557
+Top2  Egyptian cat - 0.112354
+Preprocess time: 1.553000 ms
+Prediction time: 2.098400 ms
+Postprocess time: 0.081000 ms
+```
+
+- 如果需要更改测试图片，可将图片拷贝到PaddleLite-linux-demo/image_classification_demo/assets/images目录下，然后将run.sh的IMAGE_NAME设置成指定文件名即可；
+- 如果需要重新编译示例程序，直接运行./build.sh amd64或./build.sh arm64即可。
+
+```shell
+$ cd PaddleLite-linux-demo/image_classification_demo/shell
+$ ./build.sh amd64 # For amd64
+$ ./build.sh arm64 # For arm64(FT-2000+/64) 
+```
+
+### 更新模型
+
+- 通过Paddle Fluid训练，或X2Paddle转换得到ResNet50 float32模型[resnet50_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz)；
+- 由于XPU一般部署在Server端，因此将使用PaddleLite的full api加载原始的Paddle Fluid模型进行预测，即采用CXXConfig配置相关参数。
+
+### 更新支持百度XPU的Paddle Lite库
+
+- 下载PaddleLite源码；
+
+```shell
+$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+$ cd Paddle-Lite
+$ git checkout <release-version-tag>
+```
+
+- 下载xpu_toolchain for amd64 or arm64(FT-2000+/64)；
+
+```shell
+$ wget <URL_to_download_xpu_toolchain>
+$ tar -xvf output.tar.gz
+$ mv output xpu_toolchain
+```
+
+- 编译full_publish for amd64 or arm64(FT-2000+/64)；
+
+```shell
+For amd64，如果报找不到cxx11::符号的编译错误，请将gcc切换到4.8版本。
+$ ./lite/tools/build.sh --build_xpu=ON --xpu_sdk_root=./xpu_toolchain x86
+
+For arm64(FT-2000+/64)
+$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --build_xpu=ON --xpu_sdk_root=./xpu_toolchain --with_log=ON full_publish
+```
+
+- 将编译生成的build.lite.x86/inference_lite_lib/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/amd64/include目录；
+- 将编译生成的build.lite.x86/inference_lite_lib/cxx/include/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so文件；
+- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/arm64/include目录；
+- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so文件。
+
+## 其它说明
+
+- 如需更进一步的了解相关产品的信息，请联系欧阳剑ouyangjian@baidu.com；
+- 百度昆仑的研发同学正在持续适配更多的Paddle算子，以便支持更多的Paddle模型。
--- a/docs/demo_guides/cpp_demo.md
+++ b/docs/demo_guides/cpp_demo.md
+# C++ Demo
+
+## 1. 下载最新版本预测库
+
+预测库下载界面位于[Paddle-Lite官方预编译库](../user_guides/release_lib)，可根据需求选择合适版本。
+
+以**Android-ARMv8架构**为例，可以下载以下版本：
+
+
+|ARM Version|build_extra|arm_stl|target|下载|
+|:-------:|:-----:|:-----:|:-----:|:-------:|
+|armv8|OFF|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.tiny_publish.tar.gz)|
+
+**解压后内容如下图所示：**
+
+![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/1inference_lib.png)
+
+## 2. 转化模型
+
+PaddlePaddle的原生模型需要经过[opt]()工具转化为Paddle-Lite可以支持的naive_buffer格式。
+
+以`mobilenet_v1`模型为例：
+
+（1）下载[mobilenet_v1模型](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz)后解压：
+
+```shell
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxf mobilenet_v1.tar.gz
+```
+
+**如下图所示:**
+
+![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/3inference_model.png)
+
+（2）下载[opt工具](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt)。放入同一文件夹，终端输入命令转化模型：
+
+```shell
+wget https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt
+chmod +x opt
+./opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer   --optimize_out=./mobilenet_v1_opt
+```
+
+**结果如下图所示：**
+
+![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/2opt_model.png)
+
+
+
+## 3. 编写预测程序
+
+准备好预测库和模型，我们便可以编写程序来执行预测。我们提供涵盖图像分类、目标检测等多种应用场景的C++示例demo可供参考，位于`inference_lite_lib.android.armv8/demo/cxx`。
+
+以mobile net_v1预测为例：`mobile_light`为mobilenet_v1预测示例，可以直接调用。
+
+**示例如下图所示：**
+
+![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/4light_demo.png)
+
+
+
+## 4. 编译
+
+预测程序需要编译为Android可执行文件。
+
+以mobilenet_v1模型为例，C++示例位于`inference_lite_lib.android.armv8/demo/mobile_light`
+
+```shell
+cd inference_lite_lib.android.armv8/demo/mobile_light
+```
+
+编译demo
+
+```shell
+make
+```
+
+**结果如下图所示：**
+
+![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/5compile_demo.png)
+
+## 5. 执行预测
+
+通过adb工具将可执行文件推送到手机上执行预测
+
+（1）保证电脑已经安装adb工具，手机以"USB调试"、"文件传输模式"连接到电脑。
+
+``` shell
+adb deveices   #查看adb设备是否已被识别
+```
+
+**连接如下图所示：**
+
+![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/6adb_devices.png)
+
+（2）准备预测库、模型和预测文件
+
+1、将模型、动态库和预测文件放入同一文件夹：
+
+![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/7files.png)
+
+**注意**：动态预测库文件位于: `inference_lite_lib.android.armv8/cxx/liblibpaddle_light_api_shared.so`
+
+2、文件推送到手机：
+
+``` shell
+chmod +x mobilenetv1_light_api
+adb push mobilenet_v1_opt.nb /data/local/tmp
+adb push libpaddle_light_api_shared.so /data/local/tmp
+adb push mobilenetv1_light_api /data/local/tmp
+```
+**效果如下图所示：**
+
+![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/8push_file.png)
+
+（3）执行预测
+
+```shell
+adb shell 'cd /data/local/tmp && export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp && mobilenetv1_light_api ./mobilenet_v1_opt.nb'
+```
+**结果如下图所示：**
+
+![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/9result.png)
+
+上图的`Output`为mobilenet_v1模型在全1输入时，得到的预测输出。至此，Paddle-Lite的C++ demo执行完毕。
+
+
+
+
+
+## 注：如何在代码中使用 API
+
+C++代码调用Paddle-Lite执行预测库仅需以下五步：
+
+（1）引用头文件和命名空间
+
+```c++
+#include "paddle_api.h"
+using namespace paddle::lite_api;
+```
+
+（2）指定模型文件，创建Predictor
+
+```C++
+// 1. Set MobileConfig, model_file_path is 
+// the path to model model file. 
+MobileConfig config;
+config.set_model_from_file(model_file_path);
+// 2. Create PaddlePredictor by MobileConfig
+std::shared_ptr<PaddlePredictor> predictor =
+    CreatePaddlePredictor<MobileConfig>(config);
+```
+
+（3）设置模型输入 (下面以全一输入为例)
+
+```c++
+std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+input_tensor->Resize({1, 3, 224, 224});
+auto* data = input_tensor->mutable_data<float>();
+for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+  data[i] = 1;
+}
+```
+
+（4）执行预测
+
+```c++
+predictor->Run();
+```
+
+（5）获得预测结果
+
+```c++
+std::unique_ptr<const Tensor> output_tensor(
+    std::move(predictor->GetOutput(0)));
+// 转化为数据
+auto output_data=output_tensor->data<float>();
+```
+
+
+
+
+
+## 其他cxx_demo的编译与预期结果
+
+### Light API Demo
+
+```shell
+cd ../mobile_light
+make
+adb push mobilenetv1_light_api /data/local/tmp/
+adb shell chmod +x /data/local/tmp/mobilenetv1_light_api
+adb shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt  "
+```
+
+
+### 图像分类 Demo
+
+```shell
+cd ../mobile_classify
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+make
+adb push mobile_classify /data/local/tmp/
+adb push test.jpg /data/local/tmp/
+adb push labels.txt /data/local/tmp/
+adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
+adb shell chmod +x /data/local/tmp/mobile_classify
+adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1.opt /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
+```
+
+### 目标检测 Demo
+
+```shell
+cd ../mobile_detection
+wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz
+tar zxvf mobilenetv1-ssd.tar.gz
+make
+adb push mobile_detection /data/local/tmp/
+adb push test.jpg /data/local/tmp/
+adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
+adb shell chmod +x /data/local/tmp/mobile_detection
+adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg"
+adb pull /data/local/tmp/test_detection_result.jpg ./
+```
+
+### light API Demo 运行结果
+
+运行成功后 ，将在控制台输出预测结果的前10个类别的预测概率：
+
+```shell
+Output dim: 1000
+Output[0]: 0.000191
+Output[100]: 0.000160
+Output[200]: 0.000264
+Output[300]: 0.000211
+Output[400]: 0.001032
+Output[500]: 0.000110
+Output[600]: 0.004829
+Output[700]: 0.001845
+Output[800]: 0.000202
+Output[900]: 0.000586
+```
+
+### 图像分类 Demo 运行结果
+
+运行成功后 ，将在控制台输出预测结果的前5个类别的类型索引、名字和预测概率：
+
+```shell
+parameter:  model_dir, image_path and label_file are necessary
+parameter:  topk, input_width,  input_height, are optional
+i: 0, index: 285, name:  Egyptian cat, score: 0.482870
+i: 1, index: 281, name:  tabby, tabby cat, score: 0.471593
+i: 2, index: 282, name:  tiger cat, score: 0.039779
+i: 3, index: 287, name:  lynx, catamount, score: 0.002430
+i: 4, index: 722, name:  ping-pong ball, score: 0.000508
+```
+
+### 目标检测 Demo 运行结果
+
+运行成功后 ，将在控制台输出检测目标的类型、预测概率和坐标：
+
+```shell
+running result:
+detection image size: 935, 1241, detect object: person, score: 0.996098, location: x=187, y=43, width=540, height=592
+detection image size: 935, 1241, detect object: person, score: 0.935293, location: x=123, y=639, width=579, height=597
+```
--- a/docs/user_guides/cuda.md
+++ b/docs/user_guides/cuda.md
-# Lite基于CUDA的模型预测
+# PaddleLite使用CUDA预测部署

 Lite支持在x86_64，arm64架构上（如：TX2）进行CUDA的编译运行。

@@ -28,7 +28,27 @@ cd Paddle-Lite
 ./lite/tools/build.sh --build_python=ON cuda
 ```

-编译结束会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so`。
+## 编译结果说明
+
+cuda的编译结果位于 `build_cuda/inference_lite_lib`
+**具体内容**说明：
+
+1、 `bin`文件夹：可执行工具文件，目前为空
+
+2、 `cxx`文件夹：包含c++的库文件与相应的头文件
+
+- `include`  : 头文件
+- `lib` : 库文件
+  - 打包的静态库文件：
+    - `libpaddle_api_full_bundled.a`  ：包含 full_api 和 light_api 功能的静态库
+  - 打包的动态态库文件：
+    - `libpaddle_full_api_shared.so` ：包含 full_api 和 light_api 功能的动态库
+
+3、 `third_party` 文件夹：第三方库文件
+
+4、 `demo` 文件夹：c++ demo.
+
+如果编译打开了python选项，则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite.so`。

 ## 运行

@@ -36,7 +56,6 @@ cd Paddle-Lite

 一： 下载darknet_yolov3模型，模型信息请参考[这里](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/yolov3)

-
 ```
 # 下载模型
 wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz
@@ -47,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg

 二： 运行   

-**NOTE:**此处示例使用的是python接口，后续会开放C++接口以及示例。
+**NOTE：** 此处示例使用的是python接口。

 ``` python
 #-*- coding: utf-8 -*-
@@ -56,7 +75,7 @@ import sys
 import numpy as np
 import cv2
 sys.path.append('build_cuda/inference_lite_lib/python/lib')
-from lite_core import *
+from lite import *

 def read_img(im_path, resize_h, resize_w):
  im = cv2.imread(im_path).astype('float32')
@@ -107,4 +126,14 @@ print (output_tensor.float_data()[:6])

 ```

-**NOTE：** 对CUDA的支持还在持续开发中。
+**NOTE：** 此处示例使用的是C++接口。
+
+```
+cd build_cuda/inference_lite_lib/demo/cxx/
+mkdir build && cd build
+cmake ..
+make
+wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz
+tar -zxf yolov3_infer.tar.gz
+./demo yolov3_infer
+```
--- a/docs/user_guides/fpga.md
+++ b/docs/user_guides/fpga.md
-# Lite基于FPGA的模型预测
+# PaddleLite使用FPGA预测部署

 Paddle Lite支持基于arm的FPGA zu3/zu5/zu9的模型预测，提供armv8的交叉编译

@@ -22,7 +22,7 @@ CMAKE编译选项：

 - 设置`LITE_WITH_FPGA=ON`和`LITE_WITH_ARM=ON`

-其他编译选项与ARM编译相同，可以参考[“Paddle Lite在Docker下的ARM编译”](../source_compile)。
+其他编译选项与ARM编译相同，可以参考[“Paddle Lite在Docker下的ARM编译”](../user_guides/source_compile)。
 示例如下：
 ```shell
    cmake .. \

--- a/docs/demo_guides/ios_app_demo.md
+++ b/docs/demo_guides/ios_app_demo.md
+# iOS Demo
+
+## 多种应用场景
+
+我们提供Paddle-Lite示例工程[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)，其中包含[Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo)、[iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)和[Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo)平台的示例工程。iOS demo涵盖[图像分类](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo)、[目标检测](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo)2个应用场景。
+
+### 1. 图像分类
+
+图像分类是Paddle-Lite 提供的图像处理demo ，在移动端上提供了实时的物体识别能力，可以应用到生产线自动分拣或质检、识别医疗图像、辅助医生肉眼诊断等场景。在移动端预测的效果图如下：
+
+<p align="center"><img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat2.jpg"/></p>
+
+### 2. 物体检测
+
+物体检测是Paddle-Lite 提供的图像识别demo ，在移动端上提供了检测多个物体的位置、名称、位置及数量的能力。可以应用到视频监控（是否有违规物体或行为）、工业质检（微小瑕疵的数量和位置）、医疗诊断（细胞计数、中药识别）等场景。在移动端预测的效果图如下：
+
+<p align="center"><img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog2.jpg"/></p>
+
+## iOS demo部署方法
+
+下面我们以**目标检测（object_detection_demo)**为例讲解如何部署iOS工程。
+
+**目的**：将基于Paddle-Lite预测库的iOS APP部署到苹果手机，实现物体检测。
+
+**需要的环境**：Mac 电脑上安装Xcode、苹果手机、下载到本地的[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)工程
+
+**部署步骤**：
+
+1、 目标检测的iOS示例位于 `Paddle-Lite-Demo\PaddleLite-ios-demo\object_detection_demo`
+
+2、终端中执行 `download_dependencies.sh`脚本自动下载模型和Paddle-Lite预测库
+
+```shell
+cd PaddleLite-ios-demo          # 1. 终端中进入 Paddle-Lite-Demo\PaddleLite-ios-demo
+sh download_dependencies.sh     # 2. 执行脚本下载依赖项 （需要联网）
+```
+
+下载完成后会出现提示： `Extract done `
+
+3、用Xcode打开`object_detection_demo/detection_demo.xcodeproj`文件，修改工程配置。
+依次修改 `General/Identity`和`Signing&Capabilities`属性，替换为自己的工程代号和团队名称。（必须修改，不然无法通过编译）
+
+![Xcode1](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode1.png)
+
+
+
+![Xcode2](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode2.png)
+
+4、 IPhone手机连接电脑，在Xcode中连接自己的手机 （第一次连接IPhone到电脑时，需要在IPhone的`设置->通用->设备管理`中选择本电脑并信任）
+
+<p align="center"><img width="600" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode-phone.jpg"/>
+
+5、按下左上角的 Run按钮，自动编译APP并安装到手机。在苹果手机中设置信任该APP（进入`设置->通用->设备管理`，选中新安装的APP并`验证该应用`）
+
+成功后效果如下，图一：APP安装到手机        图二： APP打开后的效果，会自动识别图片中的物体并标记
+
+<p align="center"><img width="300" height="450"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/IOS2.jpeg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="300" height="450"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/IOS3.jpeg"/></p>
+
+## iOS demo结构讲解
+
+iOS 示例的代码结构如下图所示：
+
+<p align="center"><img width="600" height="450"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/IOS-struct.png"/>
+
+   1、 mobilenetv1-ssd： 模型文件 (opt 工具转化后Paddle-Lite模型)
+
+```shell
+# 位置：
+ios-detection_demo/detection_demo/models/mobilenetv1-ssd
+```
+
+  2、 libpaddle_api_light_bundled.a、paddle_api.h : Paddle-Lite C++ 预测库和头文件
+
+```shell
+# 位置：
+# iOS预测库
+ios-detection_demo/detection_demo/lib/libpaddle_api_light_bundled.a
+# 预测库头文件
+ios-detection_demo/detection_demo/include/paddle_api.h
+ios-detection_demo/detection_demo/include/paddle_use_kernels.h
+ios-detection_demo/detection_demo/include/paddle_use_ops.h
+```
+
+  3、 ViewController.mm：主要预测代码
+
+```shell
+# 位置
+ios-detection_demo/detection_demo/ViewController.mm
+```
+
+## 代码讲解 （如何使用Paddle-Lite C++ API 执行预测）
+
+IOS 示例基于C++ API 开发，调用Paddle-Lite C++ API包括以下五步。更详细的API 描述参考： [Paddle-Lite C++ API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html)。
+
+```c++
+#include <iostream>
+// 引入C++ API
+#include "paddle_lite/paddle_api.h"
+#include "paddle_lite/paddle_use_ops.h"
+#include "paddle_lite/paddle_use_kernels.h"
+
+// 1. 设置MobileConfig
+MobileConfig config;
+config.set_model_from_file(<modelPath>); // 设置NaiveBuffer格式模型路径
+config.set_power_mode(LITE_POWER_NO_BIND); // 设置CPU运行模式
+config.set_threads(4); // 设置工作线程数
+
+// 2. 创建PaddlePredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
+
+// 3. 设置输入数据
+std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+input_tensor->Resize({1, 3, 224, 224});
+auto* data = input_tensor->mutable_data<float>();
+for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+  data[i] = 1;
+}
+
+// 4. 执行预测
+predictor->run();
+
+// 5. 获取输出数据
+std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
+std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
+for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
+  std::cout << "Output[" << i << "]: " << output_tensor->data<float>()[i]
+            << std::endl;
+}
+```
--- a/docs/user_guides/java_demo.md
+++ b/docs/user_guides/java_demo.md
@@ -9,7 +9,7 @@

 ## 编译

-首先在PaddleLite的开发 [Docker镜像](../source_compile) 中，拉取最新PaddleLite代码，编译对应你手机架构的预测库，
+首先在PaddleLite的开发 [Docker镜像](../user_guides/source_compile) 中，拉取最新PaddleLite代码，编译对应你手机架构的预测库，
 下面我们以arm8 架构举例。进入paddlelite 目录，运行以下命令：

 ```shell
@@ -73,7 +73,7 @@ resnet50_opt.nb            http://paddle-inference-dist.bj.bcebos.com/resnet50_o

 下载完后，assets文件夹里要包含解压后的上面五个模型文件夹，但demo里不需要保存原压缩.tar.gz 文件。

-注意：输入的模型要求为naive buffer存储格式，您可以通过 [**Model Optimize Tool**](../model_optimize_tool) 将fluid模型转为naive buffer存储格式。
+注意：输入的模型要求为naive buffer存储格式，您可以通过 [**Model Optimize Tool**](../user_guides/model_optimize_tool) 将fluid模型转为naive buffer存储格式。

 ## 运行 Android 程序结果


--- a/docs/demo_guides/mediatek_apu.md
+++ b/docs/demo_guides/mediatek_apu.md
+# PaddleLite使用MTK APU预测部署
+
+Paddle Lite已支持MTK APU的预测部署。
+其接入原理是与之前华为NPU类似，即加载并分析Paddle模型，将Paddle算子转成MTK的Neuron adapter API（类似Android NN API）进行网络构建，在线生成并执行模型。
+
+## 支持现状
+
+### 已支持的芯片
+
+- [MT8168](https://www.mediatek.cn/products/tablets/mt8168)/[MT8175](https://www.mediatek.cn/products/tablets/mt8175)及其他智能芯片。
+
+### 已支持的设备
+
+- MT8168-P2V1 Tablet。
+
+### 已支持的Paddle模型
+
+- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mobilenet_v1_int8_224_fluid.tar.gz)
+
+### 已支持（或部分支持）的Paddle算子
+
+- relu
+- conv2d
+- depthwise_conv2d
+- elementwise_add
+- elementwise_mul
+- fc
+- pool2d
+- softmax
+
+## 参考示例演示
+
+### 测试设备(MT8168-P2V1 Tablet)
+
+![mt8168_p2v1_tablet_front](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_front.jpg)
+
+![mt8168_p2v1_tablet_back](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_back.jpg)
+
+### 准备设备环境
+
+- 由于需要依赖特定版本的firmware，感兴趣的同学通过MTK官网[https://www.mediatek.cn/about/contact-us](https://www.mediatek.cn/about/contact-us)提供的联系方式（类别请选择"销售"），获取测试设备和firmware；
+
+### 准备交叉编译环境
+
+- 为了保证编译环境一致，建议参考[源码编译](../user_guides/source_compile)中的Docker开发环境进行配置。
+
+### 运行图像分类示例程序
+
+- 从[https://paddlelite-demo.bj.bcebos.com/devices/mediatek/PaddleLite-android-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/PaddleLite-android-demo.tar.gz)下载示例程序，解压后清单如下：
+
+```shell
+- PaddleLite-android-demo
+  - image_classification_demo
+    - assets
+      - images 
+        - tabby_cat.jpg # 测试图片
+      - labels
+        - synset_words.txt # 1000分类label文件
+      - models
+        - mobilenet_v1_int8_224_for_cpu.nb # 已通过opt转好的、适合arm cpu的mobilenetv1量化模型
+        - mobilenet_v1_int8_224_for_apu.nb # 已通过opt转好的、适合mtk apu的mobilenetv1量化模型
+    - shell # android shell端的示例程序
+      - CMakeLists.txt # 示例程序CMake脚本
+      - build
+        - image_classification_demo # 已编译好的android shell端的示例程序
+      - image_classification_demo.cc # 示例程序源码
+      - build.sh # 示例程序编译脚本
+      - run.sh # 示例程序运行脚本
+    - apk # 常规android应用程序
+      - app
+        - src
+          - main
+            - java # java层代码
+            - cpp # 自定义的jni实现
+        - app.iml
+        - build.gradle
+      - gradle
+      ...
+  - libs
+    - PaddleLite
+      - arm64-v8a
+        - include # PaddleLite头文件
+        - lib
+          - libc++_shared.so
+          - libpaddle_light_api_shared.so # 预编译PaddleLite库
+    - OpenCV # OpenCV 4.2 for android
+```
+
+- Android shell端的示例程序
+  - 进入PaddleLite-android-demo/image_classification_demo/shell，直接执行./run.sh即可，注意：run.sh不能在docker环境执行，否则可能无法找到设备；
+  - 如果需要更改测试图片，可将图片拷贝到PaddleLite-android-demo/image_classification_demo/assets/images目录下，然后将run.sh的IMAGE_NAME设置成指定文件名即可；
+  - 如果需要重新编译示例程序，直接运行./build.sh即可，注意：build.sh的执行必须在docker环境中，否则可能编译出错；
+  - 需要说明的是，由于MTK APU暂时只支持NHWC的数据布局格式，而PaddleLite默认使用NCHW的数据布局格式，导致额外增加了预测中输入张量的NCHW到NHWC的转换，大约耗费8~9ms。
+```shell
+$ cd PaddleLite-android-demo/image_classification_demo/shell
+$ ./run.sh
+...
+warmup: 5 repeat: 10, average: 30.998502 ms, max: 31.049002 ms, min: 30.937002 ms
+results: 3
+Top0  Egyptian cat - -0.122845
+Top1  tabby, tabby cat - -0.122845
+Top2  tiger cat - -0.544028
+Preprocess time: 3.620000 ms
+Prediction time: 30.998502 ms
+Postprocess time: 0.069000 ms
+
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b00000, pa = 0xfb3f9000, len = 255
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af8000, pa = 0xfb3fa000, len = 255
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af7000, pa = 0xf8ffe000, len = 255
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af6000, pa = 0xf7bfe000, len = 255
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af5000, pa = 0xf7bfd000, len = 255
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0c000, pa = 0xfb3fe000, len = 255
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0b000, pa = 0xfb3ff000, len = 255
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0a000, pa = 0xf31ff000, len = 255
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b09000, pa = 0xfb3f6000, len = 255
+[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b08000, pa = 0xf7bff000, len = 255
+```
+
+- 常规Android应用程序
+  - 安装Android Studio 3.4
+  - 打开Android Studio，在"Welcome to Android Studio"窗口点击"Open an existing Android Studio project"，在弹出的路径选择窗口中进入"PaddleLite-android-demo/image_classification_demo/apk"目录，然后点击右下角的"Open"按钮即可导入工程；
+  - 通过USB连接Android手机、平板或开发板；
+  - 临时关闭selinux模式，允许app调用系统库；
+```shell
+$ adb root
+# setenforce 0
+```
+  - 待工程加载完成后，点击菜单栏的Build->Rebuild Project按钮，如果提示CMake版本不匹配，请点击错误提示中的'Install CMake xxx.xxx.xx'按钮，重新安装CMake，然后再次点击菜单栏的Build->Rebuild Project按钮；
+  - 待工程编译完成后，点击菜单栏的Run->Run 'App'按钮，在弹出的"Select Deployment Target"窗口选择已经连接的Android设备，然后点击"OK"按钮；
+  - 等待大约1分钟后（第一次时间比较长，需要耐心等待），app已经安装到设备上。默认使用ARM CPU模型进行预测，由于MT8168的CPU由四核Arm-Cortex A53组成，性能较一般手机的A7x系列要弱很多，如下图所示，只有6fps；
+
+![mt8168_p2v1_tablet_cpu](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_cpu.jpg)
+
+  - 点击app界面右下角的设置按钮，在弹出的设置页面点击"Choose pre-installed models"，选择"mobilenet_v1_int8_for_apu"，点击返回按钮后，app将切换到APU模型，如下图所示，帧率提高到14fps。
+
+![mt8168_p2v1_tablet_apu](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_apu.jpg)
+
+
+### 更新模型
+
+- 通过Paddle Fluid训练，或X2Paddle转换得到MobileNetv1 foat32模型[mobilenet_v1_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz)；
+- 参考[模型量化-有校准数据训练后量化](../user_guides/post_quant_with_data)使用PaddleSlim对float32模型进行量化（注意：由于MTK APU只支持量化OP，在启动量化脚本时请注意相关参数的设置），最终得到全量化MobileNetV1模型[mobilenet_v1_int8_224_fluid](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mobilenet_v1_int8_224_fluid.tar.gz)；
+- 参考[模型转化方法](../user_guides/model_optimize_tool)，利用opt工具转换生成MTK APU模型，仅需要将valid_targets设置为apu,arm即可。
+```shell
+$ ./opt --model_dir=mobilenet_v1_int8_224_fluid \
+    --optimize_out_type=naive_buffer \
+    --optimize_out=mobilenet_v1_int8_224_for_apu \
+    --valid_targets=apu,arm
+```
+- 注意：opt生成的模型只是标记了MTK APU支持的Paddle算子，并没有真正生成MTK APU模型，只有在执行时才会将标记的Paddle算子转成MTK Neuron adapter API调用实现组网，最终生成并执行模型。
+
+### 更新支持MTK APU的Paddle Lite库
+
+- 下载PaddleLite源码和APU DDK；
+```shell
+$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+$ cd Paddle-Lite
+$ git checkout <release-version-tag>
+$ wget https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz
+$ tar -xvf apu_ddk.tar.gz
+```
+- 编译tiny_publish for MT8168-P2V1 Tablet
+```shell
+$ ./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared --build_extra=ON --with_log=ON --build_apu=ON --apu_ddk_root=./apu_ddk tiny_publish
+```
+- 将编译生成的build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/include替换PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/include目录；
+- 将编译生成的build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so文件。
+
+
+## 其它说明
+
+- 由于涉及到License的问题，无法提供用于测试的firmware，我们深感抱歉。如果确实对此非常感兴趣，可以参照之前提到的联系方式，直接联系MTK的销售；
+- MTK研发同学正在持续增加用于适配Paddle算子bridge/converter，以便适配更多Paddle模型。
--- a/docs/advanced_user_guides/npu.md
+++ b/docs/advanced_user_guides/npu.md
-# 使用华为NPU
+# PaddleLite使用NPU(华为)预测部署

 Paddle Lite是首款支持华为自研达芬奇架构NPU（Kirin 810/990 SoC搭载的NPU）的预测框架。
 原理是在线分析Paddle模型，将Paddle算子转成HiAI IR后，调用HiAI IR/Builder/Runtime APIs生成并执行HiAI模型。
@@ -91,7 +91,7 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
 $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared tiny_publish
 ```

-注意：为了保证编译环境一致，建议参考[源码编译](../installation/source_compile)中的Docker开发环境进行配置，然后再执行上述命令。
+注意：为了保证编译环境一致，建议参考[源码编译](../user_guides/source_compile)中的Docker开发环境进行配置，然后再执行上述命令。

 ## 优化生成NPU模型

@@ -103,7 +103,6 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
    --optimize_out_type=(protobuf|naive_buffer) \
    --optimize_out=<output_optimize_model_dir> \
    --valid_targets=npu,arm \
-    --prefer_int8_kernel=(true|false) \
    --record_tailoring_info =(true|false)
 ```
 - model_optimize_tool生成的模型只是标记了NPU支持的Paddle算子，并没有真正生成NPU HiAI模型，只有在执行时才会将标记的Paddle算子转成HiAI IR，最终生成并执行HiAI模型，具体实现参考PR[2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576)。
@@ -111,19 +110,91 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an

 ## 通过JAVA接口加载并执行NPU模型

- 使用方法和[Java实例](../user_guides/java_demo)一致，无需额外设置任何参数，只需将模型换成NPU模型即可。[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中的Image Classification Demo for Android是同时支持CPU和NPU两种模型的图像分类Demo。
+**注意：由于华为手机root权限限制，现在仅支持JAVA接口加载和执行NPU模型**

-注意：在拷贝libpaddle_lite_jni.so的时候，由于依赖HiAI DDK so和libc++_shared.so库，需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so，拷到libpaddle_lite_jni.so同级目录下。
-
-## 通过C++接口加载并执行NPU模型
-
- 使用方法和[C++实例](../user_guides/cpp_demo)一致，同样无需额外设置任何参数，只需将模型换成NPU模型即可。
-
-注意：1）不能使用安卓模拟器，需要使用真实设备，且必须是支持NPU的华为手机。2）在使用adb push命令向手机推送目标程序时，需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so，推送到目标程序同级目录下。
+- 使用方法和[Java实例](java_demo)一致，无需额外设置任何参数，只需将模型换成NPU模型即可。[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中的Image Classification Demo for Android是同时支持CPU和NPU两种模型的图像分类Demo。

+注意：在拷贝libpaddle_lite_jni.so的时候，由于依赖HiAI DDK so和libc++_shared.so库，需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so，拷到libpaddle_lite_jni.so同级目录下。

 ## 其它说明

 - 华为达芬奇架构的NPU内部大量采用float16进行运算，因此，预测结果会存在偏差，但大部分情况下精度不会有较大损失，可参考[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中Image Classification Demo for Android对同一张图片CPU与NPU的预测结果。
 - 华为Kirin 810/990 Soc搭载的自研达芬奇架构的NPU，与Kirin 970/980 Soc搭载的寒武纪NPU不一样，同样的，与Hi3559A、Hi3519A使用的NNIE也不一样，Paddle Lite只支持华为自研达芬奇架构NPU。
 - 我们正在持续增加能够适配HiAI IR的Paddle算子bridge/converter，以便适配更多Paddle模型，同时华为研发同学也在持续对HiAI IR性能进行优化。
+
+
+## 手动分割子图
+
+### 背景
+- Paddle-Lite已经支持了大量的华为NPU的算子，但是仍然不能满足所有模型的需求。对于一个有部分算子不支持的模型，Paddle-Lite会将模型划分为可以跑在NPU上的子图和跑在CPU上的子图，实现NPU和CPU自动调度的功能，通常情况下可以获得比较好的性能。在一些特殊情况下，模型会被自动划分为比较多的子图，导致CPU和NPU的切换开销很大，从而导致整体性能变差。因此，需要手动分割子图的功能来指定一些算子跑在CPU上，避免子图过多。
+
+### 功能
+- 通过配置文件来指定需要强制跑在CPU上的算子
+
+### 使用方法
+- 1、通过netron打开paddle模型文件，可以查看模型结构，获得算子的类型、输入名称。输出名称。
+    - 注意：Paddle-Lite会对模型进行优化，模型算子可以改变，需要以优化后的模型算子为准。后面会举例说明。
+- 2、生成配置文件 ```split_cfg.txt```，记录需要跑在CPU上的算子信息。
+    - 每行一条OP记录信息，以冒号":"分隔"op名称"，"op输入名"，"op输出名"，以逗号","分隔"op输入名"和"op输出名"中的不同var名。
+    - 可以部分省略输入或者输出名。比如：```op3:in3_var0```表示，指定类型为"op3"，输入为"in3_var0"的算子；```op4```表示所有类型为"op4"的算子
+    - 例子1：
+    ```
+    op0:in0_var0,in0_var1:out0_var0,out0_var1
+    op1:in1_var0,in1_var1:out1_var0
+    op2::out2_var0
+    op3:in3_var0
+    op4
+    ```
+    - 例子2：
+    ```
+    transpose:conv2d_22.tmp_1:transpose_0.tmp_0
+    ```
+    ![image](https://user-images.githubusercontent.com/50474132/80475316-4a5fda80-897b-11ea-910a-6aee13243387.png)
+
+- 3、使用环境变量```SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE```指定配置文件的位置。
+    - 例如：
+    ```
+    export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=/data/local/tmp/split_sfg.txt
+    ```
+- 4、以上步骤完成后，运行的模型中符合条件的算子将被强制跑在CPU上。
+
+### 举例
+- 以模型[image](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz)为例
+
+- 1、可以使用netron查看模型
+
+- 2、初步分析
+
+    - 下图是ssd_mobilenet_v1中的部分结构。其中红色部分暂时不支持在NPU上运行，蓝色部分可能NPU上的性能不理想。此时，如果直接让预测库自动调度的话，可能会分成多个子图，而且整体性能不佳。因此，可以将蓝色部分和绿色部分整体指定在CPU上运行，让其他部分自动运行在NPU上(红色部分会自动在CPU上运行)。
+    ![ssd_mobilenet_v1_example](https://user-images.githubusercontent.com/50474132/80453173-525b5280-895a-11ea-847f-c7dd5b5799de.png)
+
+- 3、使用opt转换模型
+
+    - opt转换过程中会打印log信息。在log中搜索```digraph G```和```// end G```可以找到优化后的模型图。
+    ![image](https://user-images.githubusercontent.com/50474132/80454098-145f2e00-895c-11ea-9f16-dde1483a9beb.png)
+    ![image](https://user-images.githubusercontent.com/50474132/80454123-1de89600-895c-11ea-86b9-a62d78a6616d.png)
+    - 将从```digraph G```开始的，到```// end G```结束的整段模型图信息，保存到```.dot```格式的文件中。可以用```graphviz```打开查看，或者在[网页版](http://dreampuf.github.io/GraphvizOnline/)查看。
+    ![image](https://user-images.githubusercontent.com/50474132/80454841-47ee8800-895d-11ea-9531-5689c5560fcb.png)
+    - 在此处确认需要被指定的算子是否被优化了。(期望是被指定的算子都还独立存在，如果被融合为了一个算子，需要指定此时融合后的算子)。
+
+- 4、写配置文件
+
+    - 在配置文件中指定可以支持NPU但是需要指定在CPU上运行的算子。
+    ```
+    reshape
+    transpose
+    concat
+    softmax
+    ```
+    - 由于这些算子都指定在CPU上运行，因此不需要特意配置算子的输入输出名称。
+
+- 5、指定配置文件路径
+
+    - 通过```export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=your_split_config_file```的方式实现。
+
+- 6、性能测试
+
+    - 设备：华为mate30 5G
+    - HIAI ddk版本：320
+    - 性能：CPU约71.8ms，NPU约16.6ms。
+    
--- a/docs/user_guides/opencl.md
+++ b/docs/user_guides/opencl.md
-# Lite基于OpenCL的ARM GPU预测
+# PaddleLite使用OpenCL预测部署

 Lite支持在Android系统上运行基于OpenCL的程序，目前支持Ubuntu环境下armv8、armv7的交叉编译。

-## 编译
+## 1. 编译

-### 编译环境
+### 1.1 编译环境

 1. Docker 容器环境；
 2. Linux（推荐 Ubuntu 16.04）环境。

 详见 **源码编译指南-环境准备** 章节。

-### 编译选项
+### 1.2 编译Paddle-Lite OpenCL库范例

-|参数|介绍|值|
-|--------|--------|--------|
-|--arm_os|代表目标操作系统|目前仅支持且默认为`android`|
-|--arm_abi|代表体系结构类型，支持armv8和armv7|默认为`armv8`即arm64-v8a；`armv7`即armeabi-v7a|
-|--arm_lang|代表编译目标文件所使用的编译器|默认为gcc，支持 gcc和clang两种|
+注：以android/armv7/opencl的目标、Docker容器的编译开发环境为例，CMake3.10，android-ndk-r17c位于`/opt/`目录下。

-### 编译Paddle-Lite OpenCL库范例
+#### 针对 Lite 用户的编译命令(无单元测试,有编译产物,适用于benchmark)

-注：以android-armv8-opencl的目标、Docker容器的编译开发环境为例，CMake3.10，android-ndk-r17c位于`/opt/`目录下。
+- `with_opencl`: `[ON | OFF]`，编译OpenCL必选；
+- `arm_abi`: `[armv7 | armv8]`；
+- `toolchain`: `[gcc | clang]`；
+- `build_extra`: `[OFF | ON]`，编译全量op和kernel，包含控制流NLP相关的op和kernel体积会大，编译时间长；
+- `build_cv`: `[OFF | ON]`，编译arm cpu neon实现的的cv预处理模块；
+- `android_stl`: `[c++_shared | c++_static | gnu_static | gnu_shared]`，paddlelite的库以何种方式链接`android_stl`，选择`c++_shared`得到的动态库体积更小，但使用时候记得上传paddlelite所编译版本（armv7或armv8）一致的`libc++_shared.so`。默认使用`c++_static`。
+
+```bash
+######################################
+# 假设当前位于处于Lite源码根目录下   #
+######################################
+
+# 导入NDK_ROOT变量，注意检查NDK安装目录若与本示例是否不同
+export NDK_ROOT=/opt/android-ndk-r17c
+
+# 删除上一次CMake自动生成的.h文件
+rm ./lite/api/paddle_use_kernels.h
+rm ./lite/api/paddle_use_ops.h
+
+# 设置编译参数并开始编译
+./lite/tools/build_android.sh \
+  --arch=armv7 \
+  --toolchain=clang \
+  --with_cv=OFF \
+  --with_log=OFF \
+  --with_extra=OFF \
+  --with_opencl=ON
+
+# 注：编译帮助请执行: ./lite/tools/build_android.sh help
+```
+
+注：该方式的编译产物中的`demo/cxx/mobile_light`适用于做benchmark，该过程不会打印开发中加入的log，注意需要提前转好模型。关于使用，详见下文**运行示例1: 编译产物demo示例**。
+
+#### 针对 Lite 开发者的编译命令(有单元测试,编译产物)
+
+注：调用`./lite/tools/ci_build.sh`执行编译，该命令会编译armv7和armv8的opencl库。虽然有编译产物，但因编译单元测试，编译产物包体积可能较大，生产环境不推荐使用。

 ```bash
 # 假设当前位于处于Lite源码根目录下
@@ -38,16 +69,20 @@ rm ./lite/api/paddle_use_ops.h
  --arm_os=android \
  --arm_abi=armv8 \
  --arm_lang=gcc \
-  build_test_arm_opencl
+  build_opencl
 ```

-编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内，这里仅罗列关键产物：
+注：如果要调试cl kernel，假设已经完成上述脚本编译(已生成cmake文件)。调试只需要修改`./lite/backends/opencl/cl_kernel/`下对应的kernel文件，保存后在项目根目录执行`python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc`，该命令会自动将修改后，再切到build目录下执行`make publish_inference`或者你要编译的单测的可执行文件名，cl kernel文件的内容会随着编译自动打包到产物包如 .so 中或者对应单测可执行文件中。
+
+### 1.3 编译产物说明
+
+编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内，根据编译参数不同，文件夹名字会略有不同。这里仅罗列关键产物：

 - `cxx`:该目录是编译目标的C++的头文件和库文件;
 - `demo`:该目录包含了两个demo，用来调用使用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`，分别对应`mobile_full`和`mobile_light`文件夹。编译对应的demo仅需在`mobile_full`或`mobile_light`文
-  - `mobile_full`:使用cxx config，可直接加载fluid模型，若使用OpenCL需要在`mobilenetv1_full_api.cc`代码里开启`DEMO_USE_OPENCL`的宏，详细见代码注释;
-  - `mobile_light`:使用mobile config，只能加载`model_optimize_tool`优化过的模型;
- `opencl`:该目录存放opencl实现的相关kernel。
+  - `mobile_full`:使用cxx config，可直接加载fluid模型，若使用OpenCL需要在`mobilenetv1_full_api.cc`代码里开启`DEMO_USE_OPENCL`的宏，详细见该文件的代码注释;
+  - `mobile_light`:使用mobile config，只能加载`model_optimize_tool`优化过的模型。
+注：`opencl`实现的相关kernel已经打包到动态库中。

 ```bash
 .
@@ -65,126 +100,78 @@ rm ./lite/api/paddle_use_ops.h
 |       |-- libpaddle_api_light_bundled.a
 |       |-- libpaddle_full_api_shared.so
 |       `-- libpaddle_light_api_shared.so
-|-- demo
-|   `-- cxx
-|       |-- Makefile.def
-|       |-- README.md
-|       |-- include
-|       |   |-- paddle_api.h
-|       |   |-- paddle_lite_factory_helper.h
-|       |   |-- paddle_place.h
-|       |   |-- paddle_use_kernels.h
-|       |   |-- paddle_use_ops.h
-|       |   `-- paddle_use_passes.h
-|       |-- mobile_full
-|       |   |-- Makefile
-|       |   `-- mobilenetv1_full_api.cc
-|       `-- mobile_light
-|           |-- Makefile
-|           `-- mobilenetv1_light_api.cc
-`-- opencl
-    `-- cl_kernel
-        |-- buffer
-        |   |-- depthwise_conv2d_kernel.cl
-        |   |-- elementwise_add_kernel.cl
-        |   |-- fc_kernel.cl
-        |   |-- im2col_kernel.cl
-        |   |-- layout_kernel.cl
-        |   |-- mat_mul_kernel.cl
-        |   |-- pool_kernel.cl
-        |   `-- relu_kernel.cl
-        |-- cl_common.h
-        `-- image
-            |-- channel_add_kernel.cl
-            |-- elementwise_add_kernel.cl
-            |-- pool_kernel.cl
-            `-- relu_kernel.cl
+`-- demo
+    `-- cxx
+        |-- Makefile.def
+        |-- README.md
+        |-- include
+        |   |-- paddle_api.h
+        |   |-- paddle_lite_factory_helper.h
+        |   |-- paddle_place.h
+        |   |-- paddle_use_kernels.h
+        |   |-- paddle_use_ops.h
+        |   `-- paddle_use_passes.h
+        |-- mobile_full
+        |   |-- Makefile
+        |   `-- mobilenetv1_full_api.cc
+        `-- mobile_light
+            |-- Makefile
+            `-- mobilenetv1_light_api.cc
 ```

 调用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`见下一部分运行示例。



-## 运行示例
+## 2. 运行示例

-下面以android、ARMv8、gcc的环境为例，介绍3个示例，分别如何在手机上执行基于OpenCL的ARM GPU推理过程。
+下面以android的环境为例，介绍3个示例，分别如何在手机上执行基于OpenCL的ARM GPU推理过程。

+### 2.1 运行示例1: 编译产物demo示例和benchmark

-**注意：** 以下命令均在Lite源码根目录下运行。在3个示例前，下面这段命令都先要执行用来准备环境:
+需要提前用模型优化工具opt转好模型(下面假设已经转换好模型，且模型名为`mobilenetv1_opencl_fp32_opt_releasev2.6_b8234efb_20200423.nb`)。编译脚本为前文**针对 Lite 用户的编译命令(无单元测试,有编译产物,适用于benchmark)**。

 ```bash
-# 在/data/local/tmp目录下创建OpenCL文件目录
-adb shell mkdir -p /data/local/tmp/opencl
-adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/buffer
-adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/image
-
-# 将OpenCL的kernels文件推送到/data/local/tmp/opencl目录下
-adb push lite/backends/opencl/cl_kernel/cl_common.h /data/local/tmp/opencl/cl_kernel/
-adb push lite/backends/opencl/cl_kernel/buffer/* /data/local/tmp/opencl/cl_kernel/buffer/
-adb push lite/backends/opencl/cl_kernel/image/* /data/local/tmp/opencl/cl_kernel/image/
+#################################
+# 假设当前位于build.xxx目录下   #
+#################################
+
+# prepare enviroment on phone
+adb shell mkdir -p /data/local/tmp/opencl/
+
+# build demo
+cd inference_lite_lib.android.armv7.opencl/demo/cxx/mobile_light/
+make
+cd -
+
+# push executable binary, library to device
+adb push inference_lite_lib.android.armv7.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/
+adb shell chmod +x /data/local/tmp/opencl/mobilenetv1_light_api
+adb push inference_lite_lib.android.armv7.opencl/cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/opencl/
+
+# push model with optimized(opt) to device
+adb push ./mobilenetv1_opencl_fp32_opt_releasev2.6_b8234efb_20200423.nb /data/local/tmp/opencl/
+
+# run demo on device
+adb shell "export LD_LIBRARY_PATH=/data/local/tmp/opencl/; \
+           /data/local/tmp/opencl/mobilenetv1_light_api \
+           /data/local/tmp/opencl/mobilenetv1_opencl_fp32_opt_releasev2.6_b8234efb_20200423.nb \
+           1 3 224 224 \
+           100 10 0" # round=100, warmup=10, print_output_tensor=0
 ```

-### 运行示例1: 编译产物demo示例
+**注：** 权重参数会在第一次运行时加载，所以第一次执行时间略长。一般将warmup的值设为10，repeats值设为多次。

-```bash
-######################################################################
-# 编译mobile_full的demo                                              #
-######################################################################
-# 步骤:                                                              #
-#   0.确保编译Paddle-Lite时编译了OpenCL;                             #
-#   1.编辑`mobilenetv1_full_api.cc`代码, 开启`DEMO_USE_OPENCL`的宏;  #
-#   2.在产物目录`demo/cxx/mobile_full`下编译`mobile_full`的demo;     #
-#   3.上传demo, 模型, opencl kernel文件到手机;                       #
-#   4.运行demo得到预期结果.                                          #
-######################################################################
-adb shell mkdir /data/local/tmp/opencl/mobilenet_v1
-chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api
-adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api /data/local/tmp/opencl/
-adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1
-
-# use mobile_full run mobilenet_v1
-# `GLOG_v` is log level
-adb shell "export GLOG_v=0; \
-    /data/local/tmp/opencl/mobilenetv1_full_api \
-    --model_dir=/data/local/tmp/opencl/mobilenet_v1 \
-    --optimized_model_dir=/data/local/tmp/opencl/full_api_opt_model"
-
-
-
-######################################################################
-# 编译mobile_light的demo                                             #
-######################################################################
-# 步骤:                                                              #
-#   0.确保编译Paddle-Lite时编译了OpenCL;                             #
-#   1.编译model_optimize_tool并对模型优化, `targets`参数为`opencl`;  #
-#   2.在产物目录`demo/cxx/mobile_light`下编译`mobile_light`的demo;   #
-#   3.上传demo, 模型, opencl kernel文件到手机;                       #
-#   4.运行demo得到预期结果.                                          #
-######################################################################
-
-# use model_optimize_tool to optimize model
-./build.model_optimize_tool/lite/api/model_optimize_tool \
-  --model_dir=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \
-  --optimize_out_type=naive_buffer \
-  --optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \
-  --valid_targets=opencl
-
-adb shell mkdir /data/local/tmp/opencl/mobilenet_v1
-chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api
-adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/
-adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1
-
-# use mobile_light run mobilenet_v1
-adb shell "export GLOG_v=5; \
-  /data/local/tmp/opencl/mobilenetv1_light_api \
-  --model_dir=/data/local/tmp/opencl/"
-```
+### 2.2 运行示例2: test_mobilenetv1单元测试

-### 运行示例2: test_mobilenetv1单元测试
+编译脚本为前文**针对 Lite 开发者的编译命令(有单元测试,编译产物)**。

 - **运行文件准备**

 ```bash
+# 在/data/local/tmp目录下创建OpenCL文件目录
+adb shell mkdir -p /data/local/tmp/opencl
+
 # 将mobilenet_v1的模型文件推送到/data/local/tmp/opencl目录下
 adb shell mkdir -p /data/local/tmp/opencl/mobilenet_v1
 adb push build.lite.android.armv8.gcc.opencl/third_party/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1/
@@ -195,47 +182,31 @@ adb push build.lite.android.armv8.gcc.opencl/lite/api/test_mobilenetv1 /data/loc

 - **执行OpenCL推理过程**

-使用如下命令运行OpenCL程序。其中：
-
- `--cl_path`指定了OpenCL的kernels文件即cl\_kernel所在目录；
- `--modle_dir`指定了模型文件所在目录。
-
 ```bash
 adb shell chmod +x /data/local/tmp/opencl/test_mobilenetv1

-adb shell /data/local/tmp/opencl/test_mobilenetv1 \
-  --cl_path=/data/local/tmp/opencl \
-  --model_dir=/data/local/tmp/opencl/mobilenet_v1 \
-  --warmup=1 \
-  --repeats=1
+adb shell "export GLOG_v=1; \
+   /data/local/tmp/opencl/test_mobilenetv1 \
+  --model_dir=/data/local/tmp/opencl/mobilenetv1_fluid/ \
+  --warmup=10 \
+  --repeats=100"
 ```

-**注意：** 因为权重参数均会在Op Kernel第一次运行时进行加载，所以第一次的执行时间会略长。一般将warmup的值设为1，repeats值设为多次。
-
-### 运行示例3: test_layout_opencl单元测试
+### 2.3 运行示例3: test_layout_opencl单元测试

- **运行文件准备**
+编译脚本为前文**针对 Lite 开发者的编译命令(有单元测试,编译产物)**。

 ```bash
-# 将OpenCL单元测试程序test_layout_opencl，推送到/data/local/tmp/opencl目录下
+adb shell mkdir -p /data/local/tmp/opencl
 adb push build.lite.android.armv8.gcc.opencl/lite/kernels/opencl/test_layout_opencl /data/local/tmp/opencl/
-```
-
-
-OpenCL推理过程**
-
-```bash
 adb shell chmod +x /data/local/tmp/opencl/test_layout_opencl
-adb shell /data/local/tmp/opencl/test_layout_opencl
+adb shell "export GLOG_v=4; \
+  /data/local/tmp/opencl/test_layout_opencl"
 ```

+## 3. 如何在Code中使用

-# 如何在Code中使用
-
-见运行示例1的demo代码:
-
-1. [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc);
-2. [./lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc).
+即编译产物`demo/cxx/mobile_light`目录下的代码，在线版参考GitHub仓库[./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc);

 注：这里给出的链接会跳转到线上最新develop分支的代码，很可能与您本地的代码存在差异，建议参考自己本地位于`lite/demo/cxx/`目录的代码，查看如何使用。


--- a/docs/demo_guides/rockchip_npu.md
+++ b/docs/demo_guides/rockchip_npu.md
+# PaddleLite使用RK NPU预测部署
+
+Paddle Lite已支持RK NPU的预测部署。
+其接入原理是与之前华为NPU类似，即加载并分析Paddle模型，将Paddle算子转成RK组网API进行网络构建，在线生成并执行模型。
+
+## 支持现状
+
+### 已支持的芯片
+
+- RK1808, RK1806，暂时不支持RK3399Pro。
+
+### 已支持的设备
+
+- RK1808/1806 EVB。
+
+### 已支持的Paddle模型
+
+- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/mobilenet_v1_int8_224_fluid.tar.gz)
+
+### 已支持（或部分支持）的Paddle算子
+
+- relu
+- conv2d
+- depthwise_conv2d
+- pool2d
+- fc
+- softmax
+- batch_norm
+- concat
+- elementwise_add
+- elementwise_sub
+- elementwise_mul
+- elementwise_div
+
+## 参考示例演示
+
+### 测试设备(RK1808 EVB)
+
+![rk1808_evb_front](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/rk1808_evb_front.jpg)
+
+![rk1808_evb_back](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/rk1808_evb_back.jpg)
+
+### 准备设备环境
+
+- 需要依赖特定版本的firmware，请参照[rknpu_ddk](https://github.com/airockchip/rknpu_ddk)的说明对设备进行firmware的更新；
+- 由于RK1808 EVB在刷firmware后，只是一个纯净的Linux系统，无法像Ubuntu那样使用apt-get命令方便的安装软件，因此，示例程序和PaddleLite库的编译均采用交叉编译方式；
+- 将MicroUSB线插入到设备的MicroUSB OTG口，就可以使用Android的adb命令进行设备的交互，再也不用配置网络使用ssh或者通过串口的方式访问设备了，这个设计非常赞！
+
+### 准备交叉编译环境
+
+- 为了保证编译环境一致，建议参考[源码编译](../user_guides/source_compile)中的Docker开发环境进行配置。
+
+### 运行图像分类示例程序
+
+- 从[https://paddlelite-demo.bj.bcebos.com/devices/rockchip/PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/PaddleLite-linux-demo.tar.gz)下载示例程序，解压后清单如下：
+
+```shell
+- PaddleLite-linux-demo
+  - image_classification_demo
+    - assets
+      - images 
+        - tabby_cat.jpg # 测试图片
+        - tabby_cat.raw # 已处理成raw数据的测试图片
+      - labels
+        - synset_words.txt # 1000分类label文件
+      - models
+        - mobilenet_v1_int8_224_for_cpu.nb # 已通过opt转好的、适合arm cpu的mobilenetv1量化模型
+        - mobilenet_v1_int8_224_for_rknpu.nb # 已通过opt转好的、适合rknpu的mobilenetv1量化模型
+    - shell
+      - CMakeLists.txt # 示例程序CMake脚本
+      - build
+        - image_classification_demo # 已编译好的示例程序
+      - image_classification_demo.cc # 示例程序源码
+      - convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本
+      - build.sh # 示例程序编译脚本
+      - run.sh # 示例程序运行脚本
+  - libs
+    - PaddleLite
+      - arm64
+        - include # PaddleLite头文件
+        - lib
+          - libGAL.so # RK DDK库
+          - libOpenVX.so
+          - libVSC.so
+          - librknpu_ddk.so
+          - libgomp.so.1 # gnuomp库
+          - libpaddle_light_api_shared.so # 预编译PaddleLite库
+      - armhf
+        - include # PaddleLite头文件
+        - lib
+          - libGAL.so
+          - libOpenVX.so
+          - libVSC.so
+          - librknpu_ddk.so
+          - libgomp.so.1
+          - libpaddle_light_api_shared.so
+```
+
+- 进入PaddleLite-linux-demo/image_classification_demo/shell，直接执行./run.sh arm64即可，注意：run.sh不能在docker环境执行，否则无法找到设备；
+```shell
+$ cd PaddleLite-linux-demo/image_classification_demo/shell
+$ ./run.sh arm64 # For RK1808 EVB
+$ ./run.sh armhf # For RK1806 EVB 
+...
+warmup: 5 repeat: 10, average: 6.499500 ms, max: 6.554000 ms, min: 6.468000 ms
+results: 3
+Top0  Egyptian cat - 0.532328
+Top1  tabby, tabby cat - 0.345136
+Top2  tiger cat - 0.111146
+Preprocess time: 2.414000 ms
+Prediction time: 6.499500 ms
+Postprocess time: 0.414000 ms
+```
+- 如果需要更改测试图片，可通过convert_to_raw_image.py工具生成；
+- 如果需要重新编译示例程序，直接运行./build.sh即可，注意：build.sh的执行必须在docker环境中，否则可能编译出错。
+
+
+### 更新模型
+
+- 通过Paddle Fluid训练，或X2Paddle转换得到MobileNetv1 foat32模型[mobilenet_v1_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz)；
+- 参考[模型量化-有校准数据训练后量化](../user_guides/post_quant_with_data)使用PaddleSlim对float32模型进行量化（注意：由于RK NPU只支持tensor-wise的全量化模型，在启动量化脚本时请注意相关参数的设置），最终得到全量化MobileNetV1模型[mobilenet_v1_int8_224_fluid](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/mobilenet_v1_int8_224_fluid.tar.gz)；
+- 参考[模型转化方法](../user_guides/model_optimize_tool)，利用opt工具转换生成RKNPU模型，仅需要将valid_targets设置为rknpu,arm即可。
+```shell
+$ ./opt --model_dir=mobilenet_v1_int8_224_fluid \
+    --optimize_out_type=naive_buffer \
+    --optimize_out=mobilenet_v1_int8_224_for_rknpu \
+    --valid_targets=rknpu,arm
+```
+- 注意：opt生成的模型只是标记了RKNPU支持的Paddle算子，并没有真正生成RK NPU模型，只有在执行时才会将标记的Paddle算子转成RK NPU组网API，最终生成并执行模型。
+
+### 更新支持RK NPU的Paddle Lite库
+
+- 下载PaddleLite源码和RK DDK；
+```shell
+$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+$ cd Paddle-Lite
+$ git checkout <release-version-tag>
+$ git clone https://github.com/airockchip/rknpu_ddk.git
+```
+- 编译full_publish and tiny_publish for RK1808 and RK1806 EVB
+```shell
+For RK1808 EVB
+$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
+$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
+
+For RK1806 EVB
+$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
+$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
+```
+- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/arm64/include目录；
+- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so文件；
+- 将编译生成的build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录；
+- 将编译生成的build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so文件。
+
+## 其它说明
+
+- RK研发同学正在持续增加用于适配Paddle算子bridge/converter，以便适配更多Paddle模型。
--- a/docs/demo_guides/x86.md
+++ b/docs/demo_guides/x86.md
--- a/docs/advanced_user_guides/add_layout.md
+++ b/docs/advanced_user_guides/add_layout.md
-# 如何增加Layout
+# 新增Layout

 Paddle-Lite中Place包含了Target、Layout、Precision信息，用来注册和选择模型中的具体Kernel。下面以增加Place中的layout：`ImageDefault`、`ImageFolder`、`ImageNW`为例，讲解如何增加新Layout。


--- a/docs/advanced_user_guides/add_new_pass.md
+++ b/docs/advanced_user_guides/add_new_pass.md
--- a/docs/advanced_user_guides/add_operation.md
+++ b/docs/advanced_user_guides/add_operation.md
--- a/docs/develop_guides/architecture-intro.md
+++ b/docs/develop_guides/architecture-intro.md
--- a/docs/develop_guides/for-developer.md
+++ b/docs/develop_guides/for-developer.md
--- a/docs/develop_guides/index.rst
+++ b/docs/develop_guides/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
--- a/docs/installation/library.md
+++ b/docs/installation/library.md
--- a/docs/introduction/faq.md
+++ b/docs/introduction/faq.md
--- a/docs/introduction/roadmap.md
+++ b/docs/introduction/roadmap.md
--- a/docs/introduction/support_hardware.md
+++ b/docs/introduction/support_hardware.md
--- a/docs/advanced_user_guides/support_operation_list.md
+++ b/docs/advanced_user_guides/support_operation_list.md
--- a/docs/user_guides/Compile/Android.md
+++ b/docs/user_guides/Compile/Android.md
--- a/docs/user_guides/Compile/Linux.md
+++ b/docs/user_guides/Compile/Linux.md
--- a/docs/user_guides/Compile/iOS.md
+++ b/docs/user_guides/Compile/iOS.md
--- a/docs/user_guides/Compile/v2.3_compile.md
+++ b/docs/user_guides/Compile/v2.3_compile.md
--- a/docs/user_guides/cpp_demo.md
+++ b/docs/user_guides/cpp_demo.md
--- a/docs/user_guides/debug.md
+++ b/docs/user_guides/debug.md
--- a/docs/user_guides/library.md
+++ b/docs/user_guides/library.md
--- a/docs/user_guides/library_tailoring.md
+++ b/docs/user_guides/library_tailoring.md
--- a/docs/user_guides/model_optimize_tool.md
+++ b/docs/user_guides/model_optimize_tool.md
--- a/docs/user_guides/model_quantization.md
+++ b/docs/user_guides/model_quantization.md
--- a/docs/user_guides/opt/opt_bin.md
+++ b/docs/user_guides/opt/opt_bin.md
--- a/docs/user_guides/opt/opt_python.md
+++ b/docs/user_guides/opt/opt_python.md
--- a/docs/user_guides/opt/x2paddle&opt.md
+++ b/docs/user_guides/opt/x2paddle&opt.md
--- a/docs/user_guides/paddle_mobile.md
+++ b/docs/user_guides/paddle_mobile.md
--- a/docs/user_guides/post_quant_no_data.md
+++ b/docs/user_guides/post_quant_no_data.md
--- a/docs/user_guides/post_quant_with_data.md
+++ b/docs/user_guides/post_quant_with_data.md
--- a/docs/user_guides/release_lib.md
+++ b/docs/user_guides/release_lib.md
--- a/docs/installation/source_compile.md
+++ b/docs/installation/source_compile.md
--- a/docs/user_guides/tutorial.md
+++ b/docs/user_guides/tutorial.md
--- a/docs/user_guides/x2paddle.md
+++ b/docs/user_guides/x2paddle.md
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
--- a/lite/api/_paddle_use_ops.h
+++ b/lite/api/_paddle_use_ops.h
--- a/lite/api/android/jni/native/CMakeLists.txt
+++ b/lite/api/android/jni/native/CMakeLists.txt
--- a/lite/api/android/jni/native/paddle_lite_jni.h
+++ b/lite/api/android/jni/native/paddle_lite_jni.h
--- a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java
+++ b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java
--- a/lite/api/apis_test.cc
+++ b/lite/api/apis_test.cc
--- a/lite/api/benchmark.cc
+++ b/lite/api/benchmark.cc
--- a/lite/api/cxx_api.cc
+++ b/lite/api/cxx_api.cc
--- a/lite/api/cxx_api.h
+++ b/lite/api/cxx_api.h
--- a/lite/api/cxx_api_bin.cc
+++ b/lite/api/cxx_api_bin.cc
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
--- a/lite/api/light_api.cc
+++ b/lite/api/light_api.cc
--- a/lite/api/light_api_impl.cc
+++ b/lite/api/light_api_impl.cc
--- a/lite/api/light_api_test.cc
+++ b/lite/api/light_api_test.cc
--- a/lite/api/lite_multithread_test.cc
+++ b/lite/api/lite_multithread_test.cc
--- a/lite/api/mobilenetv1_test.cc
+++ b/lite/api/mobilenetv1_test.cc
--- a/lite/api/mobilenetv2_test.cc
+++ b/lite/api/mobilenetv2_test.cc
--- a/lite/api/model_test.cc
+++ b/lite/api/model_test.cc
--- a/lite/api/model_test_classify.cc
+++ b/lite/api/model_test_classify.cc
--- a/lite/api/model_test_detection.cc
+++ b/lite/api/model_test_detection.cc
--- a/lite/api/ocr_attention_test.cc
+++ b/lite/api/ocr_attention_test.cc
--- a/lite/api/opt.cc
+++ b/lite/api/opt.cc
--- a/lite/api/opt_base.cc
+++ b/lite/api/opt_base.cc
--- a/lite/api/opt_base.h
+++ b/lite/api/opt_base.h
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
--- a/lite/api/paddle_api_test.cc
+++ b/lite/api/paddle_api_test.cc
--- a/lite/api/paddle_lite_factory_helper.h
+++ b/lite/api/paddle_lite_factory_helper.h
--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
--- a/lite/api/paddle_place.h
+++ b/lite/api/paddle_place.h
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
--- a/lite/api/python/CMakeLists.txt
+++ b/lite/api/python/CMakeLists.txt
--- a/lite/api/python/__init__.py
+++ b/lite/api/python/__init__.py
--- a/lite/api/python/bin/paddle_lite_opt
+++ b/lite/api/python/bin/paddle_lite_opt
--- a/lite/api/python/pybind/CMakeLists.txt
+++ b/lite/api/python/pybind/CMakeLists.txt
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
--- a/lite/api/python/pybind/pybind.h
+++ b/lite/api/python/pybind/pybind.h
--- a/lite/api/python/setup.py.in
+++ b/lite/api/python/setup.py.in
--- a/lite/api/python/setup_mac.py.in
+++ b/lite/api/python/setup_mac.py.in
--- a/lite/api/test_classify_lite_bm.cc
+++ b/lite/api/test_classify_lite_bm.cc
--- a/lite/api/test_googlenet_lite.cc
+++ b/lite/api/test_googlenet_lite.cc
--- a/lite/api/test_helper.h
+++ b/lite/api/test_helper.h
--- a/lite/api/test_inceptionv4_lite_x86.cc
+++ b/lite/api/test_inceptionv4_lite_x86.cc
--- a/lite/api/test_mobilenetv1_lite_x86.cc
+++ b/lite/api/test_mobilenetv1_lite_x86.cc
--- a/lite/api/test_mobilenetv2_lite_x86.cc
+++ b/lite/api/test_mobilenetv2_lite_x86.cc
--- a/lite/api/test_resnet50_lite_x86.cc
+++ b/lite/api/test_resnet50_lite_x86.cc
--- a/lite/api/test_step_rnn_lite_x86.cc
+++ b/lite/api/test_step_rnn_lite_x86.cc
--- a/lite/api/test_resnet50_lite_bm.cc
+++ b/lite/api/test_resnet50_lite_bm.cc
--- a/lite/api/transform_test.cc
+++ b/lite/api/transform_test.cc
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
--- a/lite/backends/apu/CMakeLists.txt
+++ b/lite/backends/apu/CMakeLists.txt
--- a/lite/backends/apu/device.cc
+++ b/lite/backends/apu/device.cc
--- a/lite/backends/apu/device.h
+++ b/lite/backends/apu/device.h
--- a/lite/backends/apu/neuron_adapter.cc
+++ b/lite/backends/apu/neuron_adapter.cc
--- a/lite/backends/apu/neuron_adapter.h
+++ b/lite/backends/apu/neuron_adapter.h
--- a/lite/backends/arm/math/CMakeLists.txt
+++ b/lite/backends/arm/math/CMakeLists.txt
--- a/lite/backends/arm/math/activation.cc
+++ b/lite/backends/arm/math/activation.cc
--- a/lite/backends/arm/math/activation.h
+++ b/lite/backends/arm/math/activation.h
--- a/lite/backends/arm/math/argmax.cc
+++ b/lite/backends/arm/math/argmax.cc
--- a/lite/backends/arm/math/beam_search.cc
+++ b/lite/backends/arm/math/beam_search.cc
--- a/lite/backends/arm/math/concat.cc
+++ b/lite/backends/arm/math/concat.cc
--- a/lite/backends/arm/math/concat.h
+++ b/lite/backends/arm/math/concat.h
--- a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc
+++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc
--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc
--- a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc
--- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
--- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc
+++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc
--- a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc
--- a/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc
+++ b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc
--- a/lite/backends/arm/math/conv_block_utils.h
+++ b/lite/backends/arm/math/conv_block_utils.h
--- a/lite/backends/arm/math/conv_depthwise.h
+++ b/lite/backends/arm/math/conv_depthwise.h
--- a/lite/backends/arm/math/conv_impl.cc
+++ b/lite/backends/arm/math/conv_impl.cc
--- a/lite/backends/arm/math/conv_impl.h
+++ b/lite/backends/arm/math/conv_impl.h
--- a/lite/backends/arm/math/elementwise.cc
+++ b/lite/backends/arm/math/elementwise.cc
--- a/lite/backends/arm/math/elementwise.h
+++ b/lite/backends/arm/math/elementwise.h
--- a/lite/backends/arm/math/gemm_prepacked_int8.cc
+++ b/lite/backends/arm/math/gemm_prepacked_int8.cc
--- a/lite/backends/arm/math/gemv_arm_int8.cc
+++ b/lite/backends/arm/math/gemv_arm_int8.cc
--- a/lite/backends/arm/math/increment.cc
+++ b/lite/backends/arm/math/increment.cc
--- a/lite/backends/arm/math/increment.h
+++ b/lite/backends/arm/math/increment.h
--- a/lite/backends/arm/math/layout.cc
+++ b/lite/backends/arm/math/layout.cc
--- a/lite/backends/arm/math/lstm.cc
+++ b/lite/backends/arm/math/lstm.cc
--- a/lite/backends/arm/math/lstm.h
+++ b/lite/backends/arm/math/lstm.h
--- a/lite/backends/arm/math/packed_sgemm.cc
+++ b/lite/backends/arm/math/packed_sgemm.cc
--- a/lite/backends/arm/math/pooling.cc
+++ b/lite/backends/arm/math/pooling.cc
--- a/lite/backends/arm/math/pooling.h
+++ b/lite/backends/arm/math/pooling.h
--- a/lite/backends/arm/math/reduce_mean.cc
+++ b/lite/backends/arm/math/reduce_mean.cc
--- a/lite/backends/arm/math/reduce_mean.h
+++ b/lite/backends/arm/math/reduce_mean.h
--- a/lite/backends/arm/math/scale.cc
+++ b/lite/backends/arm/math/scale.cc
--- a/lite/backends/arm/math/scale.h
+++ b/lite/backends/arm/math/scale.h
--- a/lite/backends/arm/math/sgemv.cc
+++ b/lite/backends/arm/math/sgemv.cc
--- a/lite/backends/arm/math/topk.cc
+++ b/lite/backends/arm/math/topk.cc
--- a/lite/backends/arm/math/topk.h
+++ b/lite/backends/arm/math/topk.h
--- a/lite/backends/arm/math/type_trans.cc
+++ b/lite/backends/arm/math/type_trans.cc
--- a/lite/backends/bm/target_wrapper.cc
+++ b/lite/backends/bm/target_wrapper.cc
--- a/lite/backends/bm/target_wrapper.h
+++ b/lite/backends/bm/target_wrapper.h
--- a/lite/backends/cuda/CMakeLists.txt
+++ b/lite/backends/cuda/CMakeLists.txt
--- a/lite/backends/cuda/context.cc
+++ b/lite/backends/cuda/context.cc
--- a/lite/backends/cuda/context.h
+++ b/lite/backends/cuda/context.h
--- a/lite/backends/cuda/math/CMakeLists.txt
+++ b/lite/backends/cuda/math/CMakeLists.txt
--- a/lite/backends/cuda/math/batched_gemm.cc
+++ b/lite/backends/cuda/math/batched_gemm.cc
--- a/lite/backends/cuda/math/elementwise.cu
+++ b/lite/backends/cuda/math/elementwise.cu
--- a/lite/backends/cuda/math/elementwise.h
+++ b/lite/backends/cuda/math/elementwise.h
--- a/lite/backends/cuda/math/utils.h
+++ b/lite/backends/cuda/math/utils.h
--- a/lite/backends/cuda/target_wrapper.h
+++ b/lite/backends/cuda/target_wrapper.h
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
--- a/lite/backends/mlu/CMakeLists.txt
+++ b/lite/backends/mlu/CMakeLists.txt
--- a/lite/backends/mlu/mlu_utils.h
+++ b/lite/backends/mlu/mlu_utils.h
--- a/lite/backends/mlu/target_wrapper.cc
+++ b/lite/backends/mlu/target_wrapper.cc
--- a/lite/backends/mlu/target_wrapper.h
+++ b/lite/backends/mlu/target_wrapper.h
--- a/lite/backends/npu/device.cc
+++ b/lite/backends/npu/device.cc
--- a/lite/backends/npu/device.h
+++ b/lite/backends/npu/device.h
--- a/lite/backends/opencl/CMakeLists.txt
+++ b/lite/backends/opencl/CMakeLists.txt
--- a/lite/backends/opencl/cl_caller.cc
+++ b/lite/backends/opencl/cl_caller.cc
--- a/lite/backends/opencl/cl_caller.h
+++ b/lite/backends/opencl/cl_caller.h
--- a/lite/backends/opencl/cl_context.cc
+++ b/lite/backends/opencl/cl_context.cc
--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
--- a/lite/backends/opencl/cl_functions_test.cc
+++ b/lite/backends/opencl/cl_functions_test.cc
--- a/lite/backends/opencl/cl_half.cc
+++ b/lite/backends/opencl/cl_half.cc
--- a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/relu_kernel.cl
--- a/lite/backends/opencl/cl_image.cc
+++ b/lite/backends/opencl/cl_image.cc
--- a/lite/backends/opencl/cl_image_converter.cc
+++ b/lite/backends/opencl/cl_image_converter.cc
--- a/lite/backends/opencl/cl_image_converter.h
+++ b/lite/backends/opencl/cl_image_converter.h
--- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
--- a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl
--- a/lite/backends/opencl/cl_kernel/cl_common.h
+++ b/lite/backends/opencl/cl_kernel/cl_common.h
--- a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
--- a/lite/api/light_api_shared.cc
+++ b/lite/api/light_api_shared.cc
--- a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl
--- a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/slice_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/slice_kernel.cl
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
--- a/lite/backends/opencl/cl_runtime.h
+++ b/lite/backends/opencl/cl_runtime.h
--- a/lite/backends/opencl/cl_utility.h
+++ b/lite/backends/opencl/cl_utility.h
--- a/lite/backends/opencl/target_wrapper.cc
+++ b/lite/backends/opencl/target_wrapper.cc
--- a/lite/backends/rknpu/CMakeLists.txt
+++ b/lite/backends/rknpu/CMakeLists.txt
--- a/lite/backends/rknpu/device.cc
+++ b/lite/backends/rknpu/device.cc
--- a/lite/backends/rknpu/device.h
+++ b/lite/backends/rknpu/device.h
--- a/lite/backends/x86/CMakeLists.txt
+++ b/lite/backends/x86/CMakeLists.txt
--- a/lite/backends/x86/cpu_info.cc
+++ b/lite/backends/x86/cpu_info.cc
--- a/lite/backends/x86/dynamic_loader.cc
+++ b/lite/backends/x86/dynamic_loader.cc
--- a/lite/backends/x86/jit/gen/act.h
+++ b/lite/backends/x86/jit/gen/act.h
--- a/lite/backends/x86/jit/gen/blas.h
+++ b/lite/backends/x86/jit/gen/blas.h
--- a/lite/backends/x86/jit/gen/embseqpool.h
+++ b/lite/backends/x86/jit/gen/embseqpool.h
--- a/lite/backends/x86/jit/gen/gru.h
+++ b/lite/backends/x86/jit/gen/gru.h
--- a/lite/backends/x86/jit/gen/hopv.h
+++ b/lite/backends/x86/jit/gen/hopv.h
--- a/lite/backends/x86/jit/gen/lstm.h
+++ b/lite/backends/x86/jit/gen/lstm.h
--- a/lite/backends/x86/jit/gen/matmul.cc
+++ b/lite/backends/x86/jit/gen/matmul.cc
--- a/lite/backends/x86/jit/gen/matmul.h
+++ b/lite/backends/x86/jit/gen/matmul.h
--- a/lite/backends/x86/jit/gen/seqpool.h
+++ b/lite/backends/x86/jit/gen/seqpool.h
--- a/lite/backends/x86/jit/gen/sgd.h
+++ b/lite/backends/x86/jit/gen/sgd.h
--- a/lite/backends/x86/jit/gen/vbroadcast.h
+++ b/lite/backends/x86/jit/gen/vbroadcast.h
--- a/lite/backends/x86/jit/gen_base.cc
+++ b/lite/backends/x86/jit/gen_base.cc
--- a/lite/backends/x86/jit/refer/refer.h
+++ b/lite/backends/x86/jit/refer/refer.h
--- a/lite/backends/x86/math/beam_search.cc
+++ b/lite/backends/x86/math/beam_search.cc
--- a/lite/backends/x86/math/beam_search_test.cc
+++ b/lite/backends/x86/math/beam_search_test.cc
--- a/lite/backends/x86/math/blas.cc
+++ b/lite/backends/x86/math/blas.cc
--- a/lite/backends/x86/math/blas_impl.h
+++ b/lite/backends/x86/math/blas_impl.h
--- a/lite/backends/x86/math/concat_and_split.cc
+++ b/lite/backends/x86/math/concat_and_split.cc
--- a/lite/backends/x86/math/cross_entropy.cc
+++ b/lite/backends/x86/math/cross_entropy.cc
--- a/lite/backends/x86/math/im2col.cc
+++ b/lite/backends/x86/math/im2col.cc
--- a/lite/backends/x86/math/im2col_cfo_cpu.h
+++ b/lite/backends/x86/math/im2col_cfo_cpu.h
--- a/lite/backends/x86/math/math_function.cc
+++ b/lite/backends/x86/math/math_function.cc
--- a/lite/backends/x86/math/math_function_impl.h
+++ b/lite/backends/x86/math/math_function_impl.h
--- a/lite/backends/x86/math/maxouting.cc
+++ b/lite/backends/x86/math/maxouting.cc
--- a/lite/backends/x86/math/pooling.cc
+++ b/lite/backends/x86/math/pooling.cc
--- a/lite/backends/x86/math/sample_prob.h
+++ b/lite/backends/x86/math/sample_prob.h
--- a/lite/backends/x86/math/search_fc.cc
+++ b/lite/backends/x86/math/search_fc.cc
--- a/lite/backends/x86/math/selected_rows_functor.cc
+++ b/lite/backends/x86/math/selected_rows_functor.cc
--- a/lite/backends/x86/math/sequence2batch.cc
+++ b/lite/backends/x86/math/sequence2batch.cc
--- a/lite/backends/x86/math/sequence2batch.h
+++ b/lite/backends/x86/math/sequence2batch.h
--- a/lite/backends/x86/math/sequence_padding.cc
+++ b/lite/backends/x86/math/sequence_padding.cc
--- a/lite/backends/x86/math/sequence_padding.h
+++ b/lite/backends/x86/math/sequence_padding.h
--- a/lite/backends/x86/math/sequence_pooling.cc
+++ b/lite/backends/x86/math/sequence_pooling.cc
--- a/lite/backends/x86/math/sequence_pooling_test.cc
+++ b/lite/backends/x86/math/sequence_pooling_test.cc
--- a/lite/backends/x86/math/sequence_scale.cc
+++ b/lite/backends/x86/math/sequence_scale.cc
--- a/lite/backends/x86/math/sequence_topk_avg_pooling.cc
+++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc
--- a/lite/backends/x86/math/softmax_impl.h
+++ b/lite/backends/x86/math/softmax_impl.h
--- a/lite/backends/x86/math/tree2col.cc
+++ b/lite/backends/x86/math/tree2col.cc
--- a/lite/backends/x86/math/unpooling.cc
+++ b/lite/backends/x86/math/unpooling.cc
--- a/lite/backends/x86/math/vol2col.cc
+++ b/lite/backends/x86/math/vol2col.cc
--- a/lite/backends/x86/parallel.h
+++ b/lite/backends/x86/parallel.h
--- a/lite/backends/x86/port.h
+++ b/lite/backends/x86/port.h
--- a/lite/backends/xpu/CMakeLists.txt
+++ b/lite/backends/xpu/CMakeLists.txt
--- a/lite/backends/xpu/device.h
+++ b/lite/backends/xpu/device.h
--- a/lite/backends/xpu/math.h
+++ b/lite/backends/xpu/math.h
--- a/lite/backends/xpu/target_wrapper.cc
+++ b/lite/backends/xpu/target_wrapper.cc
--- a/lite/backends/xpu/target_wrapper.h
+++ b/lite/backends/xpu/target_wrapper.h
--- a/lite/backends/xpu/xpu_header_sitter.h
+++ b/lite/backends/xpu/xpu_header_sitter.h
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
--- a/lite/core/arena/CMakeLists.txt
+++ b/lite/core/arena/CMakeLists.txt
--- a/lite/core/arena/framework.cc
+++ b/lite/core/arena/framework.cc
--- a/lite/core/arena/framework.h
+++ b/lite/core/arena/framework.h
--- a/lite/core/context.cc
+++ b/lite/core/context.cc
--- a/lite/core/context.h
+++ b/lite/core/context.h
--- a/lite/core/device_info.cc
+++ b/lite/core/device_info.cc
--- a/lite/core/device_info.h
+++ b/lite/core/device_info.h
--- a/lite/core/exported_symbols.lds
+++ b/lite/core/exported_symbols.lds
--- a/lite/core/kernel.cc
+++ b/lite/core/kernel.cc
--- a/lite/core/kernel.h
+++ b/lite/core/kernel.h
--- a/lite/core/lite.map
+++ b/lite/core/lite.map
--- a/lite/core/lite_tensor_test.cc
+++ b/lite/core/lite_tensor_test.cc
--- a/lite/core/memory.cc
+++ b/lite/core/memory.cc
--- a/lite/core/memory.h
+++ b/lite/core/memory.h
--- a/lite/core/mir/CMakeLists.txt
+++ b/lite/core/mir/CMakeLists.txt
--- a/lite/core/mir/dot.h
+++ b/lite/core/mir/dot.h
--- a/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc
+++ b/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc
--- a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc
+++ b/lite/core/mir/elimination/identity_scale_eliminate_pass.cc
--- a/lite/core/mir/fusion/CMakeLists.txt
+++ b/lite/core/mir/fusion/CMakeLists.txt
--- a/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
+++ b/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
--- a/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc
+++ b/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc
--- a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc
+++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc
--- a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc
+++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc
--- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
--- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc
--- a/lite/core/mir/fusion/conv_bn_fuser.cc
+++ b/lite/core/mir/fusion/conv_bn_fuser.cc
--- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc
+++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc
--- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h
+++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h
--- a/lite/core/mir/fusion/elementwise_add_activation_fuser.cc
+++ b/lite/core/mir/fusion/elementwise_add_activation_fuser.cc
--- a/lite/core/mir/fusion/elementwise_add_activation_fuser.h
+++ b/lite/core/mir/fusion/elementwise_add_activation_fuser.h
--- a/lite/core/mir/fusion/fc_fuse_pass.cc
+++ b/lite/core/mir/fusion/fc_fuse_pass.cc
--- a/lite/core/mir/fusion/interpolate_fuse_pass.cc
+++ b/lite/core/mir/fusion/interpolate_fuse_pass.cc
--- a/lite/core/mir/fusion/interpolate_fuser.cc
+++ b/lite/core/mir/fusion/interpolate_fuser.cc
--- a/lite/core/mir/fusion/interpolate_fuser.h
+++ b/lite/core/mir/fusion/interpolate_fuser.h
--- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
+++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.h
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h
--- a/lite/core/mir/fusion/scale_activation_fuse_pass.cc
+++ b/lite/core/mir/fusion/scale_activation_fuse_pass.cc
--- a/lite/core/mir/fusion/scale_activation_fuse_pass.h
+++ b/lite/core/mir/fusion/scale_activation_fuse_pass.h
--- a/lite/core/mir/fusion/scale_activation_fuser.cc
+++ b/lite/core/mir/fusion/scale_activation_fuser.cc
--- a/lite/core/mir/fusion/scale_activation_fuser.h
+++ b/lite/core/mir/fusion/scale_activation_fuser.h
--- a/lite/core/mir/generate_program_pass.cc
+++ b/lite/core/mir/generate_program_pass.cc
--- a/lite/core/mir/graph_visualize_pass.cc
+++ b/lite/core/mir/graph_visualize_pass.cc
--- a/lite/core/mir/memory_optimize_pass.cc
+++ b/lite/core/mir/memory_optimize_pass.cc
--- a/lite/core/mir/mlu_postprocess_pass.cc
+++ b/lite/core/mir/mlu_postprocess_pass.cc
--- a/lite/core/mir/mlu_postprocess_pass.h
+++ b/lite/core/mir/mlu_postprocess_pass.h
--- a/lite/core/mir/multi_stream_analysis_pass.cc
+++ b/lite/core/mir/multi_stream_analysis_pass.cc
--- a/lite/core/mir/multi_stream_analysis_pass.h
+++ b/lite/core/mir/multi_stream_analysis_pass.h
--- a/lite/core/mir/node.h
+++ b/lite/core/mir/node.h
--- a/lite/core/mir/pass.h
+++ b/lite/core/mir/pass.h
--- a/lite/core/mir/pass_registry.h
+++ b/lite/core/mir/pass_registry.h
--- a/lite/core/mir/pattern_matcher.cc
+++ b/lite/core/mir/pattern_matcher.cc
--- a/lite/core/mir/pattern_matcher.h
+++ b/lite/core/mir/pattern_matcher.h
--- a/lite/core/mir/pattern_matcher_high_api.h
+++ b/lite/core/mir/pattern_matcher_high_api.h
--- a/lite/core/mir/quantized_op_attributes_inference_pass.cc
+++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc
--- a/lite/core/mir/quantized_op_attributes_inference_pass.h
+++ b/lite/core/mir/quantized_op_attributes_inference_pass.h
--- a/lite/core/mir/runtime_context_assign_pass.cc
+++ b/lite/core/mir/runtime_context_assign_pass.cc
--- a/lite/core/mir/ssa_graph.cc
+++ b/lite/core/mir/ssa_graph.cc
--- a/lite/core/mir/ssa_graph.h
+++ b/lite/core/mir/ssa_graph.h
--- a/lite/core/mir/static_kernel_pick_pass.h
+++ b/lite/core/mir/static_kernel_pick_pass.h
--- a/lite/core/mir/subgraph/CMakeLists.txt
+++ b/lite/core/mir/subgraph/CMakeLists.txt
--- a/lite/core/mir/subgraph/subgraph_detector.cc
+++ b/lite/core/mir/subgraph/subgraph_detector.cc
--- a/lite/core/mir/subgraph/subgraph_detector.h
+++ b/lite/core/mir/subgraph/subgraph_detector.h
--- a/lite/core/mir/subgraph/subgraph_detector_test.cc
+++ b/lite/core/mir/subgraph/subgraph_detector_test.cc
--- a/lite/core/mir/subgraph/subgraph_pass.cc
+++ b/lite/core/mir/subgraph/subgraph_pass.cc
--- a/lite/core/mir/subgraph/subgraph_pass.h
+++ b/lite/core/mir/subgraph/subgraph_pass.h
--- a/lite/core/mir/subgraph/subgraph_pass_test.cc
+++ b/lite/core/mir/subgraph/subgraph_pass_test.cc
--- a/lite/core/mir/type_layout_cast_pass.cc
+++ b/lite/core/mir/type_layout_cast_pass.cc
--- a/lite/core/mir/type_layout_cast_pass.h
+++ b/lite/core/mir/type_layout_cast_pass.h
--- a/lite/core/mir/type_precision_cast_pass.cc
+++ b/lite/core/mir/type_precision_cast_pass.cc
--- a/lite/core/mir/type_precision_cast_pass.h
+++ b/lite/core/mir/type_precision_cast_pass.h
--- a/lite/core/mir/type_target_cast_pass.cc
+++ b/lite/core/mir/type_target_cast_pass.cc
--- a/lite/core/mir/type_target_cast_pass.h
+++ b/lite/core/mir/type_target_cast_pass.h
--- a/lite/core/mir/weight_quantization_preprocess_pass.cc
+++ b/lite/core/mir/weight_quantization_preprocess_pass.cc
--- a/lite/core/mir/weight_quantization_preprocess_pass.h
+++ b/lite/core/mir/weight_quantization_preprocess_pass.h
--- a/lite/core/mir/xpu_pattern_matcher.cc
+++ b/lite/core/mir/xpu_pattern_matcher.cc
--- a/lite/core/mir/xpu_pattern_matcher.h
+++ b/lite/core/mir/xpu_pattern_matcher.h
--- a/lite/core/mir/xpu_pattern_matcher_high_api.cc
+++ b/lite/core/mir/xpu_pattern_matcher_high_api.cc
--- a/lite/core/mir/xpu_pattern_matcher_high_api.h
+++ b/lite/core/mir/xpu_pattern_matcher_high_api.h
--- a/lite/core/op_lite.cc
+++ b/lite/core/op_lite.cc
--- a/lite/core/op_lite.h
+++ b/lite/core/op_lite.h
--- a/lite/core/op_registry.cc
+++ b/lite/core/op_registry.cc
--- a/lite/core/op_registry.h
+++ b/lite/core/op_registry.h
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
--- a/lite/core/profile/basic_profiler.cc
+++ b/lite/core/profile/basic_profiler.cc
--- a/lite/core/profile/basic_profiler.h
+++ b/lite/core/profile/basic_profiler.h
--- a/lite/core/profile/precision_profiler.h
+++ b/lite/core/profile/precision_profiler.h
--- a/lite/core/profile/profiler.cc
+++ b/lite/core/profile/profiler.cc
--- a/lite/core/profile/profiler.h
+++ b/lite/core/profile/profiler.h
--- a/lite/core/profile/timer.h
+++ b/lite/core/profile/timer.h
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
--- a/lite/core/program.h
+++ b/lite/core/program.h
--- a/lite/core/program_fake_utils.h
+++ b/lite/core/program_fake_utils.h
--- a/lite/core/scope.cc
+++ b/lite/core/scope.cc
--- a/lite/core/scope.h
+++ b/lite/core/scope.h
--- a/lite/core/tensor.cc
+++ b/lite/core/tensor.cc
--- a/lite/core/tensor.h
+++ b/lite/core/tensor.h
--- a/lite/core/type_system.cc
+++ b/lite/core/type_system.cc
--- a/lite/core/type_system.h
+++ b/lite/core/type_system.h
--- a/lite/core/types.cc
+++ b/lite/core/types.cc
--- a/lite/core/types.h
+++ b/lite/core/types.h
--- a/lite/core/version.h.in
+++ b/lite/core/version.h.in
--- a/lite/core/workspace.h
+++ b/lite/core/workspace.h
--- a/lite/demo/cxx/README.md
+++ b/lite/demo/cxx/README.md
--- a/lite/demo/cxx/cuda_demo/CMakeLists.txt
+++ b/lite/demo/cxx/cuda_demo/CMakeLists.txt
--- a/lite/demo/cxx/cuda_demo/demo.cc
+++ b/lite/demo/cxx/cuda_demo/demo.cc
--- a/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv7
+++ b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv7
--- a/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv8
+++ b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv8
--- a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7
+++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7
--- a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8
+++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8
--- a/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7
+++ b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7
--- a/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8
+++ b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8
--- a/lite/demo/cxx/mask_detection/mask_detection.cc
+++ b/lite/demo/cxx/mask_detection/mask_detection.cc
--- a/lite/demo/cxx/mask_detection/prepare.sh
+++ b/lite/demo/cxx/mask_detection/prepare.sh
--- a/lite/demo/cxx/mask_detection/run.sh
+++ b/lite/demo/cxx/mask_detection/run.sh
--- a/lite/demo/cxx/mobile_classify/mobile_classify.cc
+++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
--- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
+++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
--- a/lite/demo/cxx/ssd_detection/ssd_detection.cc
+++ b/lite/demo/cxx/ssd_detection/ssd_detection.cc
--- a/lite/demo/cxx/test_cv/README.md
+++ b/lite/demo/cxx/test_cv/README.md
--- a/lite/demo/cxx/test_cv/test_img_prepross.cc
+++ b/lite/demo/cxx/test_cv/test_img_prepross.cc
--- a/lite/demo/cxx/test_cv/test_model_cv.cc
+++ b/lite/demo/cxx/test_cv/test_model_cv.cc
--- a/lite/demo/cxx/test_libs/README.md
+++ b/lite/demo/cxx/test_libs/README.md
--- a/lite/demo/cxx/test_libs/classification_full.cc
+++ b/lite/demo/cxx/test_libs/classification_full.cc
--- a/lite/demo/cxx/test_libs/classification_light.cc
+++ b/lite/demo/cxx/test_libs/classification_light.cc
--- a/lite/demo/cxx/test_libs/prepare.sh
+++ b/lite/demo/cxx/test_libs/prepare.sh
--- a/lite/demo/cxx/test_libs/run.sh
+++ b/lite/demo/cxx/test_libs/run.sh
--- a/lite/demo/cxx/test_libs/test_helper.cc
+++ b/lite/demo/cxx/test_libs/test_helper.cc
--- a/lite/demo/cxx/test_libs/test_helper.h
+++ b/lite/demo/cxx/test_libs/test_helper.h
--- a/lite/demo/cxx/test_libs/yolov3_full.cc
+++ b/lite/demo/cxx/test_libs/yolov3_full.cc
--- a/lite/demo/cxx/test_libs/yolov3_light.cc
+++ b/lite/demo/cxx/test_libs/yolov3_light.cc
--- a/lite/demo/cxx/train_demo/README.md
+++ b/lite/demo/cxx/train_demo/README.md
--- a/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
+++ b/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
--- a/lite/demo/cxx/train_demo/cplus_train/data_reader.cc
+++ b/lite/demo/cxx/train_demo/cplus_train/data_reader.cc
--- a/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc
+++ b/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc
--- a/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h
+++ b/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h
--- a/lite/demo/cxx/train_demo/cplus_train/run_build.sh
+++ b/lite/demo/cxx/train_demo/cplus_train/run_build.sh
--- a/lite/demo/cxx/train_demo/image/lr_loss.png
+++ b/lite/demo/cxx/train_demo/image/lr_loss.png
--- a/lite/demo/cxx/train_demo/train.py
+++ b/lite/demo/cxx/train_demo/train.py
--- a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
+++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
--- a/lite/demo/cxx/x86_mobilenetv1_full_demo/build.sh
+++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/build.sh
--- a/lite/demo/cxx/x86_mobilenetv1_full_demo/mobilenet_full_api.cc
+++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/mobilenet_full_api.cc
--- a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
+++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
--- a/lite/demo/cxx/x86_mobilenetv1_light_demo/build.sh
+++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/build.sh
--- a/lite/demo/cxx/x86_mobilenetv1_light_demo/mobilenet_light_api.cc
+++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/mobilenet_light_api.cc
--- a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc
+++ b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc
--- a/lite/demo/java/README.md
+++ b/lite/demo/java/README.md
--- a/lite/demo/python/mobilenetv1_full_api.py
+++ b/lite/demo/python/mobilenetv1_full_api.py
--- a/lite/demo/python/mobilenetv1_light_api.py
+++ b/lite/demo/python/mobilenetv1_light_api.py
--- a/lite/fluid/data_type.cc
+++ b/lite/fluid/data_type.cc
--- a/lite/fluid/lod.h
+++ b/lite/fluid/lod.h
--- a/lite/gen_code/CMakeLists.txt
+++ b/lite/gen_code/CMakeLists.txt
--- a/lite/gen_code/gen_code.cc
+++ b/lite/gen_code/gen_code.cc
--- a/lite/gen_code/gen_code.h
+++ b/lite/gen_code/gen_code.h
--- a/lite/kernels/CMakeLists.txt
+++ b/lite/kernels/CMakeLists.txt
--- a/lite/kernels/apu/CMakeLists.txt
+++ b/lite/kernels/apu/CMakeLists.txt
--- a/lite/kernels/apu/bridges/CMakeLists.txt
+++ b/lite/kernels/apu/bridges/CMakeLists.txt
--- a/lite/kernels/apu/bridges/act_op.cc
+++ b/lite/kernels/apu/bridges/act_op.cc
--- a/lite/kernels/apu/bridges/conv_op.cc
+++ b/lite/kernels/apu/bridges/conv_op.cc
--- a/lite/kernels/apu/bridges/elementwise_ops.cc
+++ b/lite/kernels/apu/bridges/elementwise_ops.cc
--- a/lite/kernels/apu/bridges/fc_op.cc
+++ b/lite/kernels/apu/bridges/fc_op.cc
--- a/lite/kernels/apu/bridges/graph.cc
+++ b/lite/kernels/apu/bridges/graph.cc
--- a/lite/kernels/apu/bridges/graph.h
+++ b/lite/kernels/apu/bridges/graph.h
--- a/lite/kernels/apu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/apu/bridges/paddle_use_bridges.h
--- a/lite/kernels/apu/bridges/pool_op.cc
+++ b/lite/kernels/apu/bridges/pool_op.cc
--- a/lite/kernels/apu/bridges/softmax_op.cc
+++ b/lite/kernels/apu/bridges/softmax_op.cc
--- a/lite/kernels/apu/bridges/utility.cc
+++ b/lite/kernels/apu/bridges/utility.cc
--- a/lite/kernels/apu/bridges/utility.h
+++ b/lite/kernels/apu/bridges/utility.h
--- a/lite/kernels/apu/subgraph_compute.cc
+++ b/lite/kernels/apu/subgraph_compute.cc
--- a/lite/kernels/apu/subgraph_compute.h
+++ b/lite/kernels/apu/subgraph_compute.h
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
--- a/lite/kernels/arm/activation_compute.cc
+++ b/lite/kernels/arm/activation_compute.cc
--- a/lite/kernels/arm/activation_compute.h
+++ b/lite/kernels/arm/activation_compute.h
--- a/lite/kernels/arm/activation_grad_compute.cc
+++ b/lite/kernels/arm/activation_grad_compute.cc
--- a/lite/kernels/arm/activation_grad_compute.h
+++ b/lite/kernels/arm/activation_grad_compute.h
--- a/lite/kernels/arm/argmax_compute.cc
+++ b/lite/kernels/arm/argmax_compute.cc
--- a/lite/kernels/arm/argmax_compute_test.cc
+++ b/lite/kernels/arm/argmax_compute_test.cc
--- a/lite/kernels/arm/assign_value_compute.cc
+++ b/lite/kernels/arm/assign_value_compute.cc
--- a/lite/kernels/arm/assign_value_compute.h
+++ b/lite/kernels/arm/assign_value_compute.h
--- a/lite/kernels/arm/beam_search_compute.cc
+++ b/lite/kernels/arm/beam_search_compute.cc
--- a/lite/kernels/arm/beam_search_compute.h
+++ b/lite/kernels/arm/beam_search_compute.h
--- a/lite/kernels/arm/beam_search_decode_compute.cc
+++ b/lite/kernels/arm/beam_search_decode_compute.cc
--- a/lite/kernels/arm/calib_compute.cc
+++ b/lite/kernels/arm/calib_compute.cc
--- a/lite/kernels/arm/calib_compute.h
+++ b/lite/kernels/arm/calib_compute.h
--- a/lite/kernels/arm/cast_compute.cc
+++ b/lite/kernels/arm/cast_compute.cc
--- a/lite/kernels/arm/cast_compute.h
+++ b/lite/kernels/arm/cast_compute.h
--- a/lite/kernels/arm/concat_compute.cc
+++ b/lite/kernels/arm/concat_compute.cc
--- a/lite/kernels/arm/concat_compute.h
+++ b/lite/kernels/arm/concat_compute.h
--- a/lite/kernels/arm/concat_compute_test.cc
+++ b/lite/kernels/arm/concat_compute_test.cc
--- a/lite/kernels/arm/conv_compute.cc
+++ b/lite/kernels/arm/conv_compute.cc
--- a/lite/kernels/arm/conv_winograd.cc
+++ b/lite/kernels/arm/conv_winograd.cc
--- a/lite/kernels/arm/distribute_fpn_proposals_compute.cc
+++ b/lite/kernels/arm/distribute_fpn_proposals_compute.cc
--- a/lite/kernels/arm/elementwise_compute.cc
+++ b/lite/kernels/arm/elementwise_compute.cc
--- a/lite/kernels/arm/elementwise_grad_compute.cc
+++ b/lite/kernels/arm/elementwise_grad_compute.cc
--- a/lite/kernels/arm/logical_compute.h
+++ b/lite/kernels/arm/logical_compute.h
--- a/lite/kernels/arm/fill_constant_batch_size_like_compute.cc
+++ b/lite/kernels/arm/fill_constant_batch_size_like_compute.cc
--- a/lite/kernels/arm/fill_constant_batch_size_like_compute.h
+++ b/lite/kernels/arm/fill_constant_batch_size_like_compute.h
--- a/lite/kernels/arm/fill_constant_compute.cc
+++ b/lite/kernels/arm/fill_constant_compute.cc
--- a/lite/kernels/arm/write_to_array_compute.h
+++ b/lite/kernels/arm/write_to_array_compute.h
--- a/lite/kernels/arm/gather_compute.cc
+++ b/lite/kernels/arm/gather_compute.cc
--- a/lite/kernels/arm/gather_compute.h
+++ b/lite/kernels/arm/gather_compute.h
--- a/lite/kernels/arm/increment_compute.cc
+++ b/lite/kernels/arm/increment_compute.cc
--- a/lite/kernels/arm/increment_compute.h
+++ b/lite/kernels/arm/increment_compute.h
--- a/lite/kernels/arm/layout_compute.cc
+++ b/lite/kernels/arm/layout_compute.cc
--- a/lite/kernels/arm/lod_reset_compute.cc
+++ b/lite/kernels/arm/lod_reset_compute.cc
--- a/lite/kernels/arm/lod_reset_compute.h
+++ b/lite/kernels/arm/lod_reset_compute.h
--- a/lite/kernels/arm/lookup_table_compute.cc
+++ b/lite/kernels/arm/lookup_table_compute.cc
--- a/lite/kernels/arm/lookup_table_compute_test.cc
+++ b/lite/kernels/arm/lookup_table_compute_test.cc
--- a/lite/kernels/arm/lookup_table_dequant_compute.cc
+++ b/lite/kernels/arm/lookup_table_dequant_compute.cc
--- a/lite/kernels/arm/lookup_table_dequant_compute.h
+++ b/lite/kernels/arm/lookup_table_dequant_compute.h
--- a/lite/kernels/arm/lstm_compute.cc
+++ b/lite/kernels/arm/lstm_compute.cc
--- a/lite/kernels/arm/expand_compute.h
+++ b/lite/kernels/arm/expand_compute.h
--- a/lite/kernels/arm/matmul_compute.cc
+++ b/lite/kernels/arm/matmul_compute.cc
--- a/lite/kernels/arm/assign_compute.cc
+++ b/lite/kernels/arm/assign_compute.cc
--- a/lite/kernels/arm/mean_compute.h
+++ b/lite/kernels/arm/mean_compute.h
--- a/lite/kernels/arm/mean_grad_compute.cc
+++ b/lite/kernels/arm/mean_grad_compute.cc
--- a/lite/kernels/arm/mean_grad_compute.h
+++ b/lite/kernels/arm/mean_grad_compute.h
--- a/lite/kernels/arm/mul_grad_compute.cc
+++ b/lite/kernels/arm/mul_grad_compute.cc
--- a/lite/kernels/arm/is_empty_compute.h
+++ b/lite/kernels/arm/is_empty_compute.h
--- a/lite/kernels/arm/pool_compute.cc
+++ b/lite/kernels/arm/pool_compute.cc
--- a/lite/kernels/arm/pool_compute_test.cc
+++ b/lite/kernels/arm/pool_compute_test.cc
--- a/lite/kernels/arm/scale_compute.cc
+++ b/lite/kernels/arm/scale_compute.cc
--- a/lite/kernels/arm/scale_compute.h
+++ b/lite/kernels/arm/scale_compute.h
--- a/lite/kernels/arm/scale_compute_test.cc
+++ b/lite/kernels/arm/scale_compute_test.cc
--- a/lite/kernels/arm/sequence_conv_compute.cc
+++ b/lite/kernels/arm/sequence_conv_compute.cc
--- a/lite/kernels/arm/sequence_conv_compute.h
+++ b/lite/kernels/arm/sequence_conv_compute.h
--- a/lite/kernels/arm/sequence_pool_compute.cc
+++ b/lite/kernels/arm/sequence_pool_compute.cc
--- a/lite/kernels/arm/sgd_compute.cc
+++ b/lite/kernels/arm/sgd_compute.cc
--- a/lite/kernels/arm/sgd_compute.h
+++ b/lite/kernels/arm/sgd_compute.h
--- a/lite/kernels/arm/topk_compute.cc
+++ b/lite/kernels/arm/topk_compute.cc
--- a/lite/kernels/arm/while_compute.cc
+++ b/lite/kernels/arm/while_compute.cc
--- a/lite/kernels/arm/write_to_array_compute.cc
+++ b/lite/kernels/arm/write_to_array_compute.cc
--- a/lite/kernels/bm/bridges/CMakeLists.txt
+++ b/lite/kernels/bm/bridges/CMakeLists.txt
--- a/lite/kernels/bm/bridges/act_op.cc
+++ b/lite/kernels/bm/bridges/act_op.cc
--- a/lite/kernels/bm/bridges/assign_value_op.cc
+++ b/lite/kernels/bm/bridges/assign_value_op.cc
--- a/lite/kernels/bm/bridges/box_coder_op.cc
+++ b/lite/kernels/bm/bridges/box_coder_op.cc
--- a/lite/kernels/bm/bridges/cast_op.cc
+++ b/lite/kernels/bm/bridges/cast_op.cc
--- a/lite/kernels/bm/bridges/concat_op.cc
+++ b/lite/kernels/bm/bridges/concat_op.cc
--- a/lite/kernels/bm/bridges/conv_op.cc
+++ b/lite/kernels/bm/bridges/conv_op.cc
--- a/lite/kernels/bm/bridges/conv_transpose_op.cc
+++ b/lite/kernels/bm/bridges/conv_transpose_op.cc
--- a/lite/kernels/bm/bridges/density_prior_box_op.cc
+++ b/lite/kernels/bm/bridges/density_prior_box_op.cc
--- a/lite/kernels/bm/bridges/dropout_op.cc
+++ b/lite/kernels/bm/bridges/dropout_op.cc
--- a/lite/kernels/bm/bridges/elementwise_ops.cc
+++ b/lite/kernels/bm/bridges/elementwise_ops.cc
--- a/lite/kernels/bm/bridges/fill_constant_op.cc
+++ b/lite/kernels/bm/bridges/fill_constant_op.cc
--- a/lite/kernels/bm/bridges/graph.cc
+++ b/lite/kernels/bm/bridges/graph.cc
--- a/lite/kernels/bm/bridges/graph.h
+++ b/lite/kernels/bm/bridges/graph.h
--- a/lite/kernels/bm/bridges/interpolate_op.cc
+++ b/lite/kernels/bm/bridges/interpolate_op.cc
--- a/lite/kernels/bm/bridges/matmul_op.cc
+++ b/lite/kernels/bm/bridges/matmul_op.cc
--- a/lite/kernels/bm/bridges/mul_op.cc
+++ b/lite/kernels/bm/bridges/mul_op.cc
--- a/lite/kernels/bm/bridges/multiclass_nms_op.cc
+++ b/lite/kernels/bm/bridges/multiclass_nms_op.cc
--- a/lite/kernels/bm/bridges/paddle_use_bridges.h
+++ b/lite/kernels/bm/bridges/paddle_use_bridges.h
--- a/lite/kernels/bm/bridges/pool_op.cc
+++ b/lite/kernels/bm/bridges/pool_op.cc
--- a/lite/kernels/bm/bridges/prior_box_op.cc
+++ b/lite/kernels/bm/bridges/prior_box_op.cc
--- a/lite/kernels/bm/bridges/reduce_full_op.cc
+++ b/lite/kernels/bm/bridges/reduce_full_op.cc
--- a/lite/kernels/bm/bridges/shape_op.cc
+++ b/lite/kernels/bm/bridges/shape_op.cc
--- a/lite/kernels/bm/bridges/slice_op.cc
+++ b/lite/kernels/bm/bridges/slice_op.cc
--- a/lite/kernels/bm/bridges/split_op.cc
+++ b/lite/kernels/bm/bridges/split_op.cc
--- a/lite/kernels/bm/bridges/squeeze_op.cc
+++ b/lite/kernels/bm/bridges/squeeze_op.cc
--- a/lite/kernels/bm/bridges/swish_op.cc
+++ b/lite/kernels/bm/bridges/swish_op.cc
--- a/lite/kernels/bm/bridges/transpose_op.cc
+++ b/lite/kernels/bm/bridges/transpose_op.cc
--- a/lite/kernels/bm/bridges/utility.cc
+++ b/lite/kernels/bm/bridges/utility.cc
--- a/lite/kernels/bm/bridges/yolo_box_op.cc
+++ b/lite/kernels/bm/bridges/yolo_box_op.cc
--- a/lite/kernels/bm/subgraph_compute.cc
+++ b/lite/kernels/bm/subgraph_compute.cc
--- a/lite/kernels/bm/subgraph_compute.h
+++ b/lite/kernels/bm/subgraph_compute.h
--- a/lite/kernels/cuda/CMakeLists.txt
+++ b/lite/kernels/cuda/CMakeLists.txt
--- a/lite/kernels/cuda/abs_compute.cu
+++ b/lite/kernels/cuda/abs_compute.cu
--- a/lite/kernels/cuda/abs_compute.h
+++ b/lite/kernels/cuda/abs_compute.h
--- a/lite/kernels/cuda/abs_compute_test.cc
+++ b/lite/kernels/cuda/abs_compute_test.cc
--- a/lite/kernels/cuda/attention_padding_mask_compute.cu
+++ b/lite/kernels/cuda/attention_padding_mask_compute.cu
--- a/lite/kernels/cuda/elementwise_compute.cu
+++ b/lite/kernels/cuda/elementwise_compute.cu
--- a/lite/kernels/cuda/elementwise_compute.h
+++ b/lite/kernels/cuda/elementwise_compute.h
--- a/lite/kernels/cuda/fetch_compute.cc
+++ b/lite/kernels/cuda/fetch_compute.cc
--- a/lite/kernels/cuda/fetch_compute.h
+++ b/lite/kernels/cuda/fetch_compute.h
--- a/lite/kernels/cuda/lookup_table_compute.cu
+++ b/lite/kernels/cuda/lookup_table_compute.cu
--- a/lite/kernels/cuda/search_aligned_mat_mul_compute.h
+++ b/lite/kernels/cuda/search_aligned_mat_mul_compute.h
--- a/lite/kernels/cuda/search_group_padding_compute.cu
+++ b/lite/kernels/cuda/search_group_padding_compute.cu
--- a/lite/kernels/cuda/sequence_pool_compute.cu
+++ b/lite/kernels/cuda/sequence_pool_compute.cu
--- a/lite/kernels/cuda/sequence_pool_compute.h
+++ b/lite/kernels/cuda/sequence_pool_compute.h
--- a/lite/kernels/cuda/tanh_compute.cu
+++ b/lite/kernels/cuda/tanh_compute.cu
--- a/lite/kernels/cuda/tanh_compute.h
+++ b/lite/kernels/cuda/tanh_compute.h
--- a/lite/kernels/cuda/tanh_compute_test.cc
+++ b/lite/kernels/cuda/tanh_compute_test.cc
--- a/lite/kernels/fpga/CMakeLists.txt
+++ b/lite/kernels/fpga/CMakeLists.txt
--- a/lite/kernels/fpga/calib_compute.cc
+++ b/lite/kernels/fpga/calib_compute.cc
--- a/lite/kernels/fpga/cast_compute.cc
+++ b/lite/kernels/fpga/cast_compute.cc
--- a/lite/kernels/arm/read_from_array_compute.h
+++ b/lite/kernels/arm/read_from_array_compute.h
--- a/lite/kernels/fpga/feed_compute.cc
+++ b/lite/kernels/fpga/feed_compute.cc
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
--- a/lite/kernels/host/assign_compute.cc
+++ b/lite/kernels/host/assign_compute.cc
--- a/lite/kernels/arm/assign_compute.h
+++ b/lite/kernels/arm/assign_compute.h
--- a/lite/kernels/arm/compare_compute.cc
+++ b/lite/kernels/arm/compare_compute.cc
--- a/lite/kernels/host/compare_compute.h
+++ b/lite/kernels/host/compare_compute.h
--- a/lite/kernels/host/crf_decoding_compute.cc
+++ b/lite/kernels/host/crf_decoding_compute.cc
--- a/lite/kernels/host/crf_decoding_compute.h
+++ b/lite/kernels/host/crf_decoding_compute.h
--- a/lite/kernels/host/ctc_align_compute.cc
+++ b/lite/kernels/host/ctc_align_compute.cc
--- a/lite/kernels/host/ctc_align_compute.h
+++ b/lite/kernels/host/ctc_align_compute.h
--- a/lite/kernels/arm/expand_compute.cc
+++ b/lite/kernels/arm/expand_compute.cc
--- a/lite/kernels/host/expand_compute.h
+++ b/lite/kernels/host/expand_compute.h
--- a/lite/kernels/arm/is_empty_compute.cc
+++ b/lite/kernels/arm/is_empty_compute.cc
--- a/lite/kernels/host/is_empty_compute.h
+++ b/lite/kernels/host/is_empty_compute.h
--- a/lite/kernels/arm/logical_compute.cc
+++ b/lite/kernels/arm/logical_compute.cc
--- a/lite/kernels/host/logical_compute.h
+++ b/lite/kernels/host/logical_compute.h
--- a/lite/kernels/host/multiclass_nms_compute.cc
+++ b/lite/kernels/host/multiclass_nms_compute.cc
--- a/lite/kernels/host/multiclass_nms_compute_test.cc
+++ b/lite/kernels/host/multiclass_nms_compute_test.cc
--- a/lite/kernels/arm/read_from_array_compute.cc
+++ b/lite/kernels/arm/read_from_array_compute.cc
--- a/lite/kernels/host/read_from_array_compute.h
+++ b/lite/kernels/host/read_from_array_compute.h
--- a/lite/kernels/host/reshape_compute_test.cc
+++ b/lite/kernels/host/reshape_compute_test.cc
--- a/lite/kernels/arm/shape_compute.cc
+++ b/lite/kernels/arm/shape_compute.cc
--- a/lite/kernels/arm/shape_compute.h
+++ b/lite/kernels/arm/shape_compute.h
--- a/lite/kernels/arm/unsqueeze_compute.cc
+++ b/lite/kernels/arm/unsqueeze_compute.cc
--- a/lite/kernels/arm/unsqueeze_compute.h
+++ b/lite/kernels/arm/unsqueeze_compute.h
--- a/lite/kernels/host/write_to_array_compute.cc
+++ b/lite/kernels/host/write_to_array_compute.cc
--- a/lite/kernels/host/write_to_array_compute.h
+++ b/lite/kernels/host/write_to_array_compute.h
--- a/lite/kernels/mlu/CMakeLists.txt
+++ b/lite/kernels/mlu/CMakeLists.txt
--- a/lite/kernels/mlu/bridges/CMakeLists.txt
+++ b/lite/kernels/mlu/bridges/CMakeLists.txt
--- a/lite/kernels/mlu/bridges/act_op.cc
+++ b/lite/kernels/mlu/bridges/act_op.cc
--- a/lite/kernels/npu/bridges/sqrt_op_test.cc
+++ b/lite/kernels/npu/bridges/sqrt_op_test.cc
--- a/lite/kernels/mlu/bridges/batch_norm_op.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op.cc
--- a/lite/kernels/mlu/bridges/batch_norm_op_test.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op_test.cc
--- a/lite/kernels/mlu/bridges/concat_op.cc
+++ b/lite/kernels/mlu/bridges/concat_op.cc
--- a/lite/kernels/mlu/bridges/concat_op_test.cc
+++ b/lite/kernels/mlu/bridges/concat_op_test.cc
--- a/lite/kernels/mlu/bridges/conv_op.cc
+++ b/lite/kernels/mlu/bridges/conv_op.cc
--- a/lite/kernels/mlu/bridges/conv_op_test.cc
+++ b/lite/kernels/mlu/bridges/conv_op_test.cc
--- a/lite/kernels/mlu/bridges/elementwise_ops.cc
+++ b/lite/kernels/mlu/bridges/elementwise_ops.cc
--- a/lite/kernels/xpu/bridges/elementwise_ops_test.cc
+++ b/lite/kernels/xpu/bridges/elementwise_ops_test.cc
--- a/lite/kernels/mlu/bridges/fc_op.cc
+++ b/lite/kernels/mlu/bridges/fc_op.cc
--- a/lite/kernels/mlu/bridges/fc_op_test.cc
+++ b/lite/kernels/mlu/bridges/fc_op_test.cc
--- a/lite/kernels/mlu/bridges/graph.cc
+++ b/lite/kernels/mlu/bridges/graph.cc
--- a/lite/kernels/mlu/bridges/graph.h
+++ b/lite/kernels/mlu/bridges/graph.h
--- a/lite/kernels/mlu/bridges/interpolate_op.cc
+++ b/lite/kernels/mlu/bridges/interpolate_op.cc
--- a/lite/kernels/mlu/bridges/interpolate_op_test.cc
+++ b/lite/kernels/mlu/bridges/interpolate_op_test.cc
--- a/lite/kernels/mlu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h
--- a/lite/kernels/mlu/bridges/pool_op.cc
+++ b/lite/kernels/mlu/bridges/pool_op.cc
--- a/lite/kernels/xpu/bridges/pool_op_test.cc
+++ b/lite/kernels/xpu/bridges/pool_op_test.cc
--- a/lite/kernels/mlu/bridges/scale_op.cc
+++ b/lite/kernels/mlu/bridges/scale_op.cc
--- a/lite/kernels/npu/bridges/scale_op_test.cc
+++ b/lite/kernels/npu/bridges/scale_op_test.cc
--- a/lite/kernels/mlu/bridges/softmax_op.cc
+++ b/lite/kernels/mlu/bridges/softmax_op.cc
--- a/lite/kernels/npu/bridges/square_op_test.cc
+++ b/lite/kernels/npu/bridges/square_op_test.cc
--- a/lite/kernels/mlu/bridges/tensor.cc
+++ b/lite/kernels/mlu/bridges/tensor.cc
--- a/lite/kernels/mlu/bridges/tensor.h
+++ b/lite/kernels/mlu/bridges/tensor.h
--- a/lite/kernels/mlu/bridges/test_helper.cc
+++ b/lite/kernels/mlu/bridges/test_helper.cc
--- a/lite/kernels/mlu/bridges/test_helper.h
+++ b/lite/kernels/mlu/bridges/test_helper.h
--- a/lite/kernels/mlu/bridges/utility.cc
+++ b/lite/kernels/mlu/bridges/utility.cc
--- a/lite/kernels/mlu/bridges/utility.h
+++ b/lite/kernels/mlu/bridges/utility.h
--- a/lite/kernels/mlu/calib_compute.cc
+++ b/lite/kernels/mlu/calib_compute.cc
--- a/lite/kernels/mlu/calib_compute.h
+++ b/lite/kernels/mlu/calib_compute.h
--- a/lite/kernels/mlu/io_copy_compute.cc
+++ b/lite/kernels/mlu/io_copy_compute.cc
--- a/lite/kernels/mlu/layout_compute.cc
+++ b/lite/kernels/mlu/layout_compute.cc
--- a/lite/kernels/mlu/layout_compute.h
+++ b/lite/kernels/mlu/layout_compute.h
--- a/lite/kernels/mlu/subgraph_compute.cc
+++ b/lite/kernels/mlu/subgraph_compute.cc
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
--- a/lite/kernels/npu/bridges/act_op.cc
+++ b/lite/kernels/npu/bridges/act_op.cc
--- a/lite/kernels/npu/bridges/argmax_op.cc
+++ b/lite/kernels/npu/bridges/argmax_op.cc
--- a/lite/kernels/npu/bridges/batch_norm_op.cc
+++ b/lite/kernels/npu/bridges/batch_norm_op.cc
--- a/lite/kernels/npu/bridges/compare_op.cc
+++ b/lite/kernels/npu/bridges/compare_op.cc
--- a/lite/kernels/npu/bridges/concat_op.cc
+++ b/lite/kernels/npu/bridges/concat_op.cc
--- a/lite/kernels/npu/bridges/conv_op.cc
+++ b/lite/kernels/npu/bridges/conv_op.cc
--- a/lite/kernels/npu/bridges/conv_transpose_op.cc
+++ b/lite/kernels/npu/bridges/conv_transpose_op.cc
--- a/lite/kernels/npu/bridges/dropout_op.cc
+++ b/lite/kernels/npu/bridges/dropout_op.cc
--- a/lite/kernels/npu/bridges/elementwise_ops.cc
+++ b/lite/kernels/npu/bridges/elementwise_ops.cc
--- a/lite/kernels/npu/bridges/engine.cc
+++ b/lite/kernels/npu/bridges/engine.cc
--- a/lite/kernels/npu/bridges/engine.h
+++ b/lite/kernels/npu/bridges/engine.h
--- a/lite/kernels/npu/bridges/square_op.cc
+++ b/lite/kernels/npu/bridges/square_op.cc
--- a/lite/kernels/npu/bridges/fc_op.cc
+++ b/lite/kernels/npu/bridges/fc_op.cc
--- a/lite/kernels/npu/bridges/fill_constant_batch_size_like_op.cc
+++ b/lite/kernels/npu/bridges/fill_constant_batch_size_like_op.cc
--- a/lite/kernels/npu/bridges/fill_constant_op.cc
+++ b/lite/kernels/npu/bridges/fill_constant_op.cc
--- a/lite/kernels/npu/bridges/gather_op.cc
+++ b/lite/kernels/npu/bridges/gather_op.cc
--- a/lite/kernels/npu/bridges/graph.h
+++ b/lite/kernels/npu/bridges/graph.h
--- a/lite/kernels/npu/bridges/increment_op.cc
+++ b/lite/kernels/npu/bridges/increment_op.cc
--- a/lite/kernels/npu/bridges/instance_norm_op.cc
+++ b/lite/kernels/npu/bridges/instance_norm_op.cc
--- a/lite/kernels/npu/bridges/interpolate_op.cc
+++ b/lite/kernels/npu/bridges/interpolate_op.cc
--- a/lite/kernels/npu/bridges/layer_norm_op.cc
+++ b/lite/kernels/npu/bridges/layer_norm_op.cc
--- a/lite/kernels/npu/bridges/lookup_table_op.cc
+++ b/lite/kernels/npu/bridges/lookup_table_op.cc
--- a/lite/kernels/npu/bridges/matmul_op.cc
+++ b/lite/kernels/npu/bridges/matmul_op.cc
--- a/lite/kernels/npu/bridges/mul_op.cc
+++ b/lite/kernels/npu/bridges/mul_op.cc
--- a/lite/kernels/npu/bridges/pad2d_op.cc
+++ b/lite/kernels/npu/bridges/pad2d_op.cc
--- a/lite/kernels/npu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/npu/bridges/paddle_use_bridges.h
--- a/lite/kernels/npu/bridges/pool_op.cc
+++ b/lite/kernels/npu/bridges/pool_op.cc
--- a/lite/kernels/npu/bridges/reduce_mean_op.cc
+++ b/lite/kernels/npu/bridges/reduce_mean_op.cc
--- a/lite/kernels/npu/bridges/registry.h
+++ b/lite/kernels/npu/bridges/registry.h
--- a/lite/kernels/npu/bridges/reshape_op.cc
+++ b/lite/kernels/npu/bridges/reshape_op.cc
--- a/lite/kernels/npu/bridges/scale_op.cc
+++ b/lite/kernels/npu/bridges/scale_op.cc
--- a/lite/kernels/npu/bridges/shape_op.cc
+++ b/lite/kernels/npu/bridges/shape_op.cc
--- a/lite/kernels/npu/bridges/shuffle_channel_op.cc
+++ b/lite/kernels/npu/bridges/shuffle_channel_op.cc
--- a/lite/kernels/npu/bridges/softmax_op.cc
+++ b/lite/kernels/npu/bridges/softmax_op.cc
--- a/lite/kernels/npu/bridges/split_op.cc
+++ b/lite/kernels/npu/bridges/split_op.cc
--- a/lite/kernels/npu/bridges/sqrt_op.cc
+++ b/lite/kernels/npu/bridges/sqrt_op.cc
--- a/lite/kernels/npu/bridges/transpose_op.cc
+++ b/lite/kernels/npu/bridges/transpose_op.cc
--- a/lite/kernels/npu/bridges/unsqueeze_op.cc
+++ b/lite/kernels/npu/bridges/unsqueeze_op.cc
--- a/lite/kernels/npu/subgraph_compute.cc
+++ b/lite/kernels/npu/subgraph_compute.cc
--- a/lite/kernels/npu/subgraph_compute.h
+++ b/lite/kernels/npu/subgraph_compute.h
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
--- a/lite/kernels/opencl/sigmoid_compute.cc
+++ b/lite/kernels/opencl/sigmoid_compute.cc
--- a/lite/kernels/opencl/activation_buffer_compute_test.cc
+++ b/lite/kernels/opencl/activation_buffer_compute_test.cc
--- a/lite/kernels/opencl/activation_image_compute.cc
+++ b/lite/kernels/opencl/activation_image_compute.cc
--- a/lite/kernels/opencl/activation_image_compute_test.cc
+++ b/lite/kernels/opencl/activation_image_compute_test.cc
--- a/lite/kernels/opencl/bilinear_interp_image_compute.cc
+++ b/lite/kernels/opencl/bilinear_interp_image_compute.cc
--- a/lite/kernels/opencl/bilinear_interp_image_compute_test.cc
+++ b/lite/kernels/opencl/bilinear_interp_image_compute_test.cc
--- a/lite/kernels/opencl/box_coder_image_compute.cc
+++ b/lite/kernels/opencl/box_coder_image_compute.cc
--- a/lite/kernels/opencl/box_coder_image_compute_test.cc
+++ b/lite/kernels/opencl/box_coder_image_compute_test.cc
--- a/lite/kernels/opencl/concat_buffer_compute.cc
+++ b/lite/kernels/opencl/concat_buffer_compute.cc
--- a/lite/kernels/opencl/concat_buffer_compute_test.cc
+++ b/lite/kernels/opencl/concat_buffer_compute_test.cc
--- a/lite/kernels/opencl/concat_compute.cc
+++ b/lite/kernels/opencl/concat_compute.cc
--- a/lite/kernels/opencl/concat_image_compute.cc
+++ b/lite/kernels/opencl/concat_image_compute.cc
--- a/lite/kernels/opencl/concat_compute_test.cc
+++ b/lite/kernels/opencl/concat_compute_test.cc
--- a/lite/kernels/opencl/conv_buffer_compute.cc
+++ b/lite/kernels/opencl/conv_buffer_compute.cc
--- a/lite/kernels/opencl/conv_compute.h
+++ b/lite/kernels/opencl/conv_compute.h
--- a/lite/kernels/opencl/conv_compute_test.cc
+++ b/lite/kernels/opencl/conv_compute_test.cc
--- a/lite/kernels/opencl/conv_compute.cc
+++ b/lite/kernels/opencl/conv_compute.cc
--- a/lite/kernels/opencl/conv_image_compute.h
+++ b/lite/kernels/opencl/conv_image_compute.h
--- a/lite/kernels/opencl/conv_image2d_compute_test.cc
+++ b/lite/kernels/opencl/conv_image2d_compute_test.cc
--- a/lite/kernels/opencl/depthwise_conv2d_compute.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_compute.cc
--- a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc
--- a/lite/kernels/opencl/depthwise_conv2d_image2d_compute_test.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_image2d_compute_test.cc
--- a/lite/kernels/opencl/scale_compute.cc
+++ b/lite/kernels/opencl/scale_compute.cc
--- a/lite/kernels/opencl/dropout_image_compute_test.cc
+++ b/lite/kernels/opencl/dropout_image_compute_test.cc
--- a/lite/kernels/opencl/elementwise_add_buffer_compute.cc
+++ b/lite/kernels/opencl/elementwise_add_buffer_compute.cc
--- a/lite/kernels/opencl/elementwise_add_compute.h
+++ b/lite/kernels/opencl/elementwise_add_compute.h
--- a/lite/kernels/opencl/elementwise_add_compute_test.cc
+++ b/lite/kernels/opencl/elementwise_add_compute_test.cc
--- a/lite/kernels/opencl/elementwise_add_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_add_image_compute.cc
--- a/lite/kernels/opencl/elementwise_add_image_compute.h
+++ b/lite/kernels/opencl/elementwise_add_image_compute.h
--- a/lite/kernels/opencl/elementwise_add_image_compute_test.cc
+++ b/lite/kernels/opencl/elementwise_add_image_compute_test.cc
--- a/lite/kernels/opencl/elementwise_mul_compute.cc
+++ b/lite/kernels/opencl/elementwise_mul_compute.cc
--- a/lite/kernels/opencl/elementwise_mul_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc
--- a/lite/kernels/opencl/elementwise_mul_compute_test.cc
+++ b/lite/kernels/opencl/elementwise_mul_compute_test.cc
--- a/lite/kernels/opencl/elementwise_add_compute.cc
+++ b/lite/kernels/opencl/elementwise_add_compute.cc
--- a/lite/kernels/opencl/elementwise_mul_compute.h
+++ b/lite/kernels/opencl/elementwise_mul_compute.h
--- a/lite/kernels/opencl/elementwise_sub_image_compute_test.cc
+++ b/lite/kernels/opencl/elementwise_sub_image_compute_test.cc
--- a/lite/kernels/opencl/fc_compute.cc
+++ b/lite/kernels/opencl/fc_compute.cc
--- a/lite/kernels/opencl/fc_compute_test.cc
+++ b/lite/kernels/opencl/fc_compute_test.cc
--- a/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc
+++ b/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc
--- a/lite/kernels/opencl/fusion_elementwise_add_activation_compute.cc
+++ b/lite/kernels/opencl/fusion_elementwise_add_activation_compute.cc
--- a/lite/kernels/opencl/fusion_elementwise_sub_activation_image_compute.cc
+++ b/lite/kernels/opencl/fusion_elementwise_sub_activation_image_compute.cc
--- a/lite/kernels/opencl/grid_sampler_image_compute.cc
+++ b/lite/kernels/opencl/grid_sampler_image_compute.cc
--- a/lite/kernels/opencl/grid_sampler_image_compute_test.cc
+++ b/lite/kernels/opencl/grid_sampler_image_compute_test.cc
--- a/lite/backends/opencl/cl_im2col_test.cc
+++ b/lite/backends/opencl/cl_im2col_test.cc
--- a/lite/kernels/opencl/image_helper.h
+++ b/lite/kernels/opencl/image_helper.h
--- a/lite/kernels/opencl/instance_norm_image_compute.cc
+++ b/lite/kernels/opencl/instance_norm_image_compute.cc
--- a/lite/kernels/opencl/instance_norm_image_compute_test.cc
+++ b/lite/kernels/opencl/instance_norm_image_compute_test.cc
--- a/lite/kernels/opencl/io_copy_compute.cc
+++ b/lite/kernels/opencl/io_copy_compute.cc
--- a/lite/kernels/opencl/io_copy_compute_test.cc
+++ b/lite/kernels/opencl/io_copy_compute_test.cc
--- a/lite/kernels/opencl/layout_compute.cc
+++ b/lite/kernels/opencl/layout_compute.cc
--- a/lite/kernels/opencl/layout_compute_test.cc
+++ b/lite/kernels/opencl/layout_compute_test.cc
--- a/lite/kernels/opencl/lrn_image_compute.cc
+++ b/lite/kernels/opencl/lrn_image_compute.cc
--- a/lite/kernels/opencl/lrn_image_compute_test.cc
+++ b/lite/kernels/opencl/lrn_image_compute_test.cc
--- a/lite/kernels/opencl/mul_compute.cc
+++ b/lite/kernels/opencl/mul_compute.cc
--- a/lite/kernels/opencl/mul_compute_test.cc
+++ b/lite/kernels/opencl/mul_compute_test.cc
--- a/lite/kernels/opencl/nearest_interp_compute.cc
+++ b/lite/kernels/opencl/nearest_interp_compute.cc
--- a/lite/kernels/opencl/nearest_interp_compute_test.cc
+++ b/lite/kernels/opencl/nearest_interp_compute_test.cc
--- a/lite/kernels/opencl/pad2d_image_compute.cc
+++ b/lite/kernels/opencl/pad2d_image_compute.cc
--- a/lite/kernels/opencl/pad2d_image_compute_test.cc
+++ b/lite/kernels/opencl/pad2d_image_compute_test.cc
--- a/lite/kernels/opencl/pool_buffer_compute.cc
+++ b/lite/kernels/opencl/pool_buffer_compute.cc
--- a/lite/kernels/opencl/pool_buffer_compute_test.cc
+++ b/lite/kernels/opencl/pool_buffer_compute_test.cc
--- a/lite/kernels/opencl/pool_compute.cc
+++ b/lite/kernels/opencl/pool_compute.cc
--- a/lite/kernels/opencl/pool_compute_test.cc
+++ b/lite/kernels/opencl/pool_compute_test.cc
--- a/lite/kernels/opencl/relu_compute.cc
+++ b/lite/kernels/opencl/relu_compute.cc
--- a/lite/kernels/opencl/relu_compute_test.cc
+++ b/lite/kernels/opencl/relu_compute_test.cc
--- a/lite/kernels/opencl/reshape_compute.cc
+++ b/lite/kernels/opencl/reshape_compute.cc
--- a/lite/kernels/opencl/reshape_compute_test.cc
+++ b/lite/kernels/opencl/reshape_compute_test.cc
--- a/lite/kernels/opencl/scale_image_compute.cc
+++ b/lite/kernels/opencl/scale_image_compute.cc
--- a/lite/kernels/opencl/scale_compute_test.cc
+++ b/lite/kernels/opencl/scale_compute_test.cc
--- a/lite/kernels/opencl/sigmoid_compute_test.cc
+++ b/lite/kernels/opencl/sigmoid_compute_test.cc
--- a/lite/kernels/opencl/slice_image_compute.cc
+++ b/lite/kernels/opencl/slice_image_compute.cc
--- a/lite/kernels/opencl/slice_image_compute_test.cc
+++ b/lite/kernels/opencl/slice_image_compute_test.cc
--- a/lite/kernels/opencl/test_helper.h
+++ b/lite/kernels/opencl/test_helper.h
--- a/lite/kernels/rknpu/CMakeLists.txt
+++ b/lite/kernels/rknpu/CMakeLists.txt
--- a/lite/kernels/rknpu/bridges/CMakeLists.txt
+++ b/lite/kernels/rknpu/bridges/CMakeLists.txt
--- a/lite/kernels/rknpu/bridges/act_op.cc
+++ b/lite/kernels/rknpu/bridges/act_op.cc
--- a/lite/kernels/rknpu/bridges/batch_norm_op.cc
+++ b/lite/kernels/rknpu/bridges/batch_norm_op.cc
--- a/lite/kernels/rknpu/bridges/concat_op.cc
+++ b/lite/kernels/rknpu/bridges/concat_op.cc
--- a/lite/kernels/rknpu/bridges/conv_op.cc
+++ b/lite/kernels/rknpu/bridges/conv_op.cc
--- a/lite/kernels/rknpu/bridges/elementwise_ops.cc
+++ b/lite/kernels/rknpu/bridges/elementwise_ops.cc
--- a/lite/kernels/rknpu/bridges/fc_op.cc
+++ b/lite/kernels/rknpu/bridges/fc_op.cc
--- a/lite/kernels/rknpu/bridges/graph.cc
+++ b/lite/kernels/rknpu/bridges/graph.cc
--- a/lite/kernels/rknpu/bridges/graph.h
+++ b/lite/kernels/rknpu/bridges/graph.h
--- a/lite/kernels/rknpu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/rknpu/bridges/paddle_use_bridges.h
--- a/lite/kernels/rknpu/bridges/pool_op.cc
+++ b/lite/kernels/rknpu/bridges/pool_op.cc
--- a/lite/kernels/rknpu/bridges/softmax_op.cc
+++ b/lite/kernels/rknpu/bridges/softmax_op.cc
--- a/lite/kernels/rknpu/bridges/utility.cc
+++ b/lite/kernels/rknpu/bridges/utility.cc
--- a/lite/kernels/rknpu/bridges/utility.h
+++ b/lite/kernels/rknpu/bridges/utility.h
--- a/lite/kernels/rknpu/subgraph_compute.cc
+++ b/lite/kernels/rknpu/subgraph_compute.cc
--- a/lite/kernels/rknpu/subgraph_compute.h
+++ b/lite/kernels/rknpu/subgraph_compute.h
--- a/lite/kernels/x86/CMakeLists.txt
+++ b/lite/kernels/x86/CMakeLists.txt
--- a/lite/kernels/x86/activation_compute.h
+++ b/lite/kernels/x86/activation_compute.h
--- a/lite/kernels/x86/attention_padding_mask_compute.h
+++ b/lite/kernels/x86/attention_padding_mask_compute.h
--- a/lite/kernels/x86/batch_norm_compute.h
+++ b/lite/kernels/x86/batch_norm_compute.h
--- a/lite/kernels/x86/cast_compute.cc
+++ b/lite/kernels/x86/cast_compute.cc
--- a/lite/kernels/x86/concat_compute.h
+++ b/lite/kernels/x86/concat_compute.h
--- a/lite/kernels/x86/conv_compute.h
+++ b/lite/kernels/x86/conv_compute.h
--- a/lite/kernels/x86/dropout_compute.h
+++ b/lite/kernels/x86/dropout_compute.h
--- a/lite/kernels/x86/elementwise_compute.cc
+++ b/lite/kernels/x86/elementwise_compute.cc
--- a/lite/kernels/x86/elementwise_compute.h
+++ b/lite/kernels/x86/elementwise_compute.h
--- a/lite/kernels/x86/elementwise_op_function.h
+++ b/lite/kernels/x86/elementwise_op_function.h
--- a/lite/kernels/x86/fc_compute.h
+++ b/lite/kernels/x86/fc_compute.h
--- a/lite/kernels/x86/fill_constant_batch_size_like_compute.h
+++ b/lite/kernels/x86/fill_constant_batch_size_like_compute.h
--- a/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc
+++ b/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc
--- a/lite/kernels/x86/gather_compute.h
+++ b/lite/kernels/x86/gather_compute.h
--- a/lite/kernels/x86/gelu_compute_test.cc
+++ b/lite/kernels/x86/gelu_compute_test.cc
--- a/lite/kernels/x86/gru_compute.h
+++ b/lite/kernels/x86/gru_compute.h
--- a/lite/kernels/x86/layer_norm_compute.h
+++ b/lite/kernels/x86/layer_norm_compute.h
--- a/lite/kernels/x86/layer_norm_compute_test.cc
+++ b/lite/kernels/x86/layer_norm_compute_test.cc
--- a/lite/kernels/x86/leaky_relu_compute_test.cc
+++ b/lite/kernels/x86/leaky_relu_compute_test.cc
--- a/lite/kernels/x86/lookup_table_compute.h
+++ b/lite/kernels/x86/lookup_table_compute.h
--- a/lite/kernels/x86/match_matrix_tensor_compute.cc
+++ b/lite/kernels/x86/match_matrix_tensor_compute.cc
--- a/lite/kernels/x86/matmul_compute.h
+++ b/lite/kernels/x86/matmul_compute.h
--- a/lite/kernels/x86/mul_compute.h
+++ b/lite/kernels/x86/mul_compute.h
--- a/lite/kernels/x86/reduce_compute.h
+++ b/lite/kernels/x86/reduce_compute.h
--- a/lite/kernels/x86/scale_compute.h
+++ b/lite/kernels/x86/scale_compute.h
--- a/lite/kernels/x86/search_grnn_compute.cc
+++ b/lite/kernels/x86/search_grnn_compute.cc
--- a/lite/kernels/x86/search_group_padding_compute.h
+++ b/lite/kernels/x86/search_group_padding_compute.h
--- a/lite/kernels/x86/search_seq_fc_compute.h
+++ b/lite/kernels/x86/search_seq_fc_compute.h
--- a/lite/kernels/x86/sequence_arithmetic_compute.h
+++ b/lite/kernels/x86/sequence_arithmetic_compute.h
--- a/lite/kernels/x86/sequence_concat_compute.h
+++ b/lite/kernels/x86/sequence_concat_compute.h
--- a/lite/kernels/x86/sequence_concat_compute_test.cc
+++ b/lite/kernels/x86/sequence_concat_compute_test.cc
--- a/lite/kernels/x86/sequence_conv_compute.cc
+++ b/lite/kernels/x86/sequence_conv_compute.cc
--- a/lite/kernels/x86/sequence_conv_compute.h
+++ b/lite/kernels/x86/sequence_conv_compute.h
--- a/lite/kernels/x86/sequence_expand_as_compute.h
+++ b/lite/kernels/x86/sequence_expand_as_compute.h
--- a/lite/kernels/x86/sequence_pool_compute.h
+++ b/lite/kernels/x86/sequence_pool_compute.h
--- a/lite/kernels/x86/sequence_pool_compute_test.cc
+++ b/lite/kernels/x86/sequence_pool_compute_test.cc
--- a/lite/kernels/x86/sequence_reshape_compute.cc
+++ b/lite/kernels/x86/sequence_reshape_compute.cc
--- a/lite/kernels/x86/sequence_reshape_compute.h
+++ b/lite/kernels/x86/sequence_reshape_compute.h
--- a/lite/kernels/x86/sequence_reverse_compute_test.cc
+++ b/lite/kernels/x86/sequence_reverse_compute_test.cc
--- a/lite/kernels/x86/sequence_unpad_compute.cc
+++ b/lite/kernels/x86/sequence_unpad_compute.cc
--- a/lite/kernels/x86/sequence_unpad_compute.h
+++ b/lite/kernels/x86/sequence_unpad_compute.h
--- a/lite/kernels/x86/shape_compute.h
+++ b/lite/kernels/x86/shape_compute.h
--- a/lite/kernels/x86/slice_compute.h
+++ b/lite/kernels/x86/slice_compute.h
--- a/lite/kernels/x86/slice_compute_test.cc
+++ b/lite/kernels/x86/slice_compute_test.cc
--- a/lite/kernels/x86/softmax_compute.h
+++ b/lite/kernels/x86/softmax_compute.h
--- a/lite/kernels/x86/softmax_compute_test.cc
+++ b/lite/kernels/x86/softmax_compute_test.cc
--- a/lite/kernels/x86/squeeze_compute.h
+++ b/lite/kernels/x86/squeeze_compute.h
--- a/lite/kernels/x86/stack_compute.h
+++ b/lite/kernels/x86/stack_compute.h
--- a/lite/kernels/x86/tanh_compute_test.cc
+++ b/lite/kernels/x86/tanh_compute_test.cc
--- a/lite/kernels/x86/transpose_compute.h
+++ b/lite/kernels/x86/transpose_compute.h
--- a/lite/kernels/x86/uniform_random_compute.cc
+++ b/lite/kernels/x86/uniform_random_compute.cc
--- a/lite/kernels/x86/var_conv_2d_compute.h
+++ b/lite/kernels/x86/var_conv_2d_compute.h
--- a/lite/kernels/x86/var_conv_2d_compute_test.cc
+++ b/lite/kernels/x86/var_conv_2d_compute_test.cc
--- a/lite/kernels/xpu/CMakeLists.txt
+++ b/lite/kernels/xpu/CMakeLists.txt
--- a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc
+++ b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc
--- a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h
+++ b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h
--- a/lite/kernels/xpu/__xpu__fc_compute.cc
+++ b/lite/kernels/xpu/__xpu__fc_compute.cc
--- a/lite/kernels/xpu/__xpu__fc_compute.h
+++ b/lite/kernels/xpu/__xpu__fc_compute.h
--- a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc
+++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc
--- a/lite/kernels/xpu/__xpu__multi_encoder_compute.h
+++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h
--- a/lite/kernels/xpu/__xpu__resnet50_compute.cc
+++ b/lite/kernels/xpu/__xpu__resnet50_compute.cc
--- a/lite/kernels/xpu/__xpu__resnet50_compute.h
+++ b/lite/kernels/xpu/__xpu__resnet50_compute.h
--- a/lite/kernels/xpu/activation_compute.cc
+++ b/lite/kernels/xpu/activation_compute.cc
--- a/lite/kernels/arm/compare_compute.h
+++ b/lite/kernels/arm/compare_compute.h
--- a/lite/kernels/xpu/batch_norm_compute.cc
+++ b/lite/kernels/xpu/batch_norm_compute.cc
--- a/lite/kernels/xpu/batch_norm_compute.h
+++ b/lite/kernels/xpu/batch_norm_compute.h
--- a/lite/kernels/xpu/bridges/CMakeLists.txt
+++ b/lite/kernels/xpu/bridges/CMakeLists.txt
--- a/lite/kernels/xpu/bridges/act_op.cc
+++ b/lite/kernels/xpu/bridges/act_op.cc
--- a/lite/kernels/xpu/bridges/batch_norm_op.cc
+++ b/lite/kernels/xpu/bridges/batch_norm_op.cc
--- a/lite/kernels/xpu/bridges/conv_op.cc
+++ b/lite/kernels/xpu/bridges/conv_op.cc
--- a/lite/kernels/xpu/bridges/dropout_op.cc
+++ b/lite/kernels/xpu/bridges/dropout_op.cc
--- a/lite/kernels/xpu/bridges/elementwise_ops.cc
+++ b/lite/kernels/xpu/bridges/elementwise_ops.cc
--- a/lite/kernels/xpu/bridges/gather_op.cc
+++ b/lite/kernels/xpu/bridges/gather_op.cc
--- a/lite/kernels/xpu/bridges/graph.cc
+++ b/lite/kernels/xpu/bridges/graph.cc
--- a/lite/kernels/xpu/bridges/graph.h
+++ b/lite/kernels/xpu/bridges/graph.h
--- a/lite/kernels/xpu/bridges/layer_norm_op.cc
+++ b/lite/kernels/xpu/bridges/layer_norm_op.cc
--- a/lite/kernels/xpu/bridges/lookup_table_op.cc
+++ b/lite/kernels/xpu/bridges/lookup_table_op.cc
--- a/lite/kernels/xpu/bridges/matmul_op.cc
+++ b/lite/kernels/xpu/bridges/matmul_op.cc
--- a/lite/kernels/xpu/bridges/mul_op.cc
+++ b/lite/kernels/xpu/bridges/mul_op.cc
--- a/lite/kernels/xpu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/xpu/bridges/paddle_use_bridges.h
--- a/lite/kernels/xpu/bridges/pool_op.cc
+++ b/lite/kernels/xpu/bridges/pool_op.cc
--- a/lite/kernels/xpu/bridges/reshape_op.cc
+++ b/lite/kernels/xpu/bridges/reshape_op.cc
--- a/lite/kernels/xpu/bridges/scale_op.cc
+++ b/lite/kernels/xpu/bridges/scale_op.cc
--- a/lite/kernels/xpu/bridges/slice_op.cc
+++ b/lite/kernels/xpu/bridges/slice_op.cc
--- a/lite/kernels/xpu/bridges/softmax_op.cc
+++ b/lite/kernels/xpu/bridges/softmax_op.cc
--- a/lite/kernels/xpu/bridges/stack_op.cc
+++ b/lite/kernels/xpu/bridges/stack_op.cc
--- a/lite/kernels/xpu/bridges/transpose_op.cc
+++ b/lite/kernels/xpu/bridges/transpose_op.cc
--- a/lite/kernels/xpu/bridges/utility.h
+++ b/lite/kernels/xpu/bridges/utility.h
--- a/lite/kernels/xpu/bridges/yolo_box_op.cc
+++ b/lite/kernels/xpu/bridges/yolo_box_op.cc
--- a/lite/kernels/xpu/cast_compute.cc
+++ b/lite/kernels/xpu/cast_compute.cc
--- a/lite/kernels/xpu/cast_compute.h
+++ b/lite/kernels/xpu/cast_compute.h
--- a/lite/kernels/xpu/conv_compute.cc
+++ b/lite/kernels/xpu/conv_compute.cc
--- a/lite/kernels/xpu/conv_compute.h
+++ b/lite/kernels/xpu/conv_compute.h
--- a/lite/kernels/xpu/dropout_compute.cc
+++ b/lite/kernels/xpu/dropout_compute.cc
--- a/lite/kernels/xpu/dropout_compute.h
+++ b/lite/kernels/xpu/dropout_compute.h
--- a/lite/kernels/xpu/elementwise_compute.cc
+++ b/lite/kernels/xpu/elementwise_compute.cc
--- a/lite/kernels/xpu/elementwise_compute.h
+++ b/lite/kernels/xpu/elementwise_compute.h
--- a/lite/kernels/xpu/io_copy_compute.cc
+++ b/lite/kernels/xpu/io_copy_compute.cc
--- a/lite/kernels/xpu/layer_norm_compute.cc
+++ b/lite/kernels/xpu/layer_norm_compute.cc
--- a/lite/kernels/xpu/layer_norm_compute.h
+++ b/lite/kernels/xpu/layer_norm_compute.h
--- a/lite/kernels/xpu/lookup_table_compute.cc
+++ b/lite/kernels/xpu/lookup_table_compute.cc
--- a/lite/kernels/xpu/lookup_table_compute.h
+++ b/lite/kernels/xpu/lookup_table_compute.h
--- a/lite/kernels/xpu/matmul_compute.cc
+++ b/lite/kernels/xpu/matmul_compute.cc
--- a/lite/kernels/xpu/matmul_compute.h
+++ b/lite/kernels/xpu/matmul_compute.h
--- a/lite/kernels/xpu/mul_compute.cc
+++ b/lite/kernels/xpu/mul_compute.cc
--- a/lite/kernels/xpu/mul_compute.h
+++ b/lite/kernels/xpu/mul_compute.h
--- a/lite/kernels/xpu/pool_compute.cc
+++ b/lite/kernels/xpu/pool_compute.cc
--- a/lite/kernels/xpu/pool_compute.h
+++ b/lite/kernels/xpu/pool_compute.h
--- a/lite/kernels/xpu/scale_compute.cc
+++ b/lite/kernels/xpu/scale_compute.cc
--- a/lite/kernels/xpu/scale_compute.h
+++ b/lite/kernels/xpu/scale_compute.h
--- a/lite/kernels/xpu/slice_compute.cc
+++ b/lite/kernels/xpu/slice_compute.cc
--- a/lite/kernels/xpu/slice_compute.h
+++ b/lite/kernels/xpu/slice_compute.h
--- a/lite/kernels/xpu/softmax_compute.cc
+++ b/lite/kernels/xpu/softmax_compute.cc
--- a/lite/kernels/xpu/softmax_compute.h
+++ b/lite/kernels/xpu/softmax_compute.h
--- a/lite/kernels/xpu/stack_compute.cc
+++ b/lite/kernels/xpu/stack_compute.cc
--- a/lite/kernels/xpu/stack_compute.h
+++ b/lite/kernels/xpu/stack_compute.h
--- a/lite/kernels/xpu/subgraph_compute.cc
+++ b/lite/kernels/xpu/subgraph_compute.cc
--- a/lite/kernels/xpu/subgraph_compute.h
+++ b/lite/kernels/xpu/subgraph_compute.h
--- a/lite/kernels/xpu/utils.h
+++ b/lite/kernels/xpu/utils.h
--- a/lite/model_parser/compatible_pb.cc
+++ b/lite/model_parser/compatible_pb.cc
--- a/lite/model_parser/compatible_pb_test.cc
+++ b/lite/model_parser/compatible_pb_test.cc
--- a/lite/model_parser/cpp/var_desc.h
+++ b/lite/model_parser/cpp/var_desc.h
--- a/lite/model_parser/desc_apis.h
+++ b/lite/model_parser/desc_apis.h
--- a/lite/model_parser/model_parser.cc
+++ b/lite/model_parser/model_parser.cc
--- a/lite/model_parser/model_parser_test.cc
+++ b/lite/model_parser/model_parser_test.cc
--- a/lite/model_parser/naive_buffer/CMakeLists.txt
+++ b/lite/model_parser/naive_buffer/CMakeLists.txt
--- a/lite/model_parser/naive_buffer/naive_buffer.h
+++ b/lite/model_parser/naive_buffer/naive_buffer.h
--- a/lite/model_parser/naive_buffer/naive_buffer_test.cc
+++ b/lite/model_parser/naive_buffer/naive_buffer_test.cc
--- a/lite/model_parser/naive_buffer/var_desc.cc
+++ b/lite/model_parser/naive_buffer/var_desc.cc
--- a/lite/model_parser/naive_buffer/var_desc.h
+++ b/lite/model_parser/naive_buffer/var_desc.h
--- a/lite/model_parser/pb/var_desc.cc
+++ b/lite/model_parser/pb/var_desc.cc
--- a/lite/model_parser/pb/var_desc.h
+++ b/lite/model_parser/pb/var_desc.h
--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
--- a/lite/operators/__xpu__embedding_with_eltwise_add_op.cc
+++ b/lite/operators/__xpu__embedding_with_eltwise_add_op.cc
--- a/lite/operators/__xpu__embedding_with_eltwise_add_op.h
+++ b/lite/operators/__xpu__embedding_with_eltwise_add_op.h
--- a/lite/operators/__xpu__fc_op.cc
+++ b/lite/operators/__xpu__fc_op.cc
--- a/lite/operators/__xpu__fc_op.h
+++ b/lite/operators/__xpu__fc_op.h
--- a/lite/operators/__xpu__multi_encoder_op.cc
+++ b/lite/operators/__xpu__multi_encoder_op.cc
--- a/lite/operators/__xpu__multi_encoder_op.h
+++ b/lite/operators/__xpu__multi_encoder_op.h
--- a/lite/operators/__xpu__resnet50_op.cc
+++ b/lite/operators/__xpu__resnet50_op.cc
--- a/lite/operators/__xpu__resnet50_op.h
+++ b/lite/operators/__xpu__resnet50_op.h
--- a/lite/operators/activation_extra_ops.cc
+++ b/lite/operators/activation_extra_ops.cc
--- a/lite/operators/activation_grad_ops.cc
+++ b/lite/operators/activation_grad_ops.cc
--- a/lite/operators/activation_grad_ops.h
+++ b/lite/operators/activation_grad_ops.h
--- a/lite/operators/activation_ops.cc
+++ b/lite/operators/activation_ops.cc
--- a/lite/operators/activation_ops.h
+++ b/lite/operators/activation_ops.h
--- a/lite/operators/affine_channel_op.cc
+++ b/lite/operators/affine_channel_op.cc
--- a/lite/operators/affine_channel_op.h
+++ b/lite/operators/affine_channel_op.h
--- a/lite/operators/anchor_generator_op.cc
+++ b/lite/operators/anchor_generator_op.cc
--- a/lite/operators/anchor_generator_op.h
+++ b/lite/operators/anchor_generator_op.h
--- a/lite/operators/argmax_op.cc
+++ b/lite/operators/argmax_op.cc
--- a/lite/operators/argmax_op.h
+++ b/lite/operators/argmax_op.h
--- a/lite/operators/assign_op.cc
+++ b/lite/operators/assign_op.cc
--- a/lite/operators/assign_op.h
+++ b/lite/operators/assign_op.h
--- a/lite/operators/assign_value_op.cc
+++ b/lite/operators/assign_value_op.cc
--- a/lite/operators/assign_value_op.h
+++ b/lite/operators/assign_value_op.h
--- a/lite/operators/attention_padding_mask_op.cc
+++ b/lite/operators/attention_padding_mask_op.cc
--- a/lite/operators/attention_padding_mask_op.h
+++ b/lite/operators/attention_padding_mask_op.h
--- a/lite/operators/axpy_op.cc
+++ b/lite/operators/axpy_op.cc
--- a/lite/operators/axpy_op.h
+++ b/lite/operators/axpy_op.h
--- a/lite/operators/batch_norm_op.cc
+++ b/lite/operators/batch_norm_op.cc
--- a/lite/operators/batch_norm_op.h
+++ b/lite/operators/batch_norm_op.h
--- a/lite/operators/beam_search_decode_op.cc
+++ b/lite/operators/beam_search_decode_op.cc
--- a/lite/operators/beam_search_decode_op.h
+++ b/lite/operators/beam_search_decode_op.h
--- a/lite/operators/beam_search_op.cc
+++ b/lite/operators/beam_search_op.cc
--- a/lite/operators/beam_search_op.h
+++ b/lite/operators/beam_search_op.h
--- a/lite/operators/box_clip_op.cc
+++ b/lite/operators/box_clip_op.cc
--- a/lite/operators/box_clip_op.h
+++ b/lite/operators/box_clip_op.h
--- a/lite/operators/box_coder_op.cc
+++ b/lite/operators/box_coder_op.cc
--- a/lite/operators/box_coder_op.h
+++ b/lite/operators/box_coder_op.h
--- a/lite/operators/calib_op.cc
+++ b/lite/operators/calib_op.cc
--- a/lite/operators/calib_op.h
+++ b/lite/operators/calib_op.h
--- a/lite/operators/cast_op.cc
+++ b/lite/operators/cast_op.cc
--- a/lite/operators/cast_op.h
+++ b/lite/operators/cast_op.h
--- a/lite/operators/collect_fpn_proposals_op.cc
+++ b/lite/operators/collect_fpn_proposals_op.cc
--- a/lite/operators/collect_fpn_proposals_op.h
+++ b/lite/operators/collect_fpn_proposals_op.h
--- a/lite/operators/compare_op.cc
+++ b/lite/operators/compare_op.cc
--- a/lite/operators/compare_op.h
+++ b/lite/operators/compare_op.h
--- a/lite/operators/concat_op.cc
+++ b/lite/operators/concat_op.cc
--- a/lite/operators/concat_op.h
+++ b/lite/operators/concat_op.h
--- a/lite/operators/conditional_block_op.cc
+++ b/lite/operators/conditional_block_op.cc
--- a/lite/operators/conditional_block_op.h
+++ b/lite/operators/conditional_block_op.h
--- a/lite/operators/conv_op.cc
+++ b/lite/operators/conv_op.cc
--- a/lite/operators/conv_op.h
+++ b/lite/operators/conv_op.h
--- a/lite/operators/conv_transpose_op.cc
+++ b/lite/operators/conv_transpose_op.cc
--- a/lite/operators/conv_transpose_op.h
+++ b/lite/operators/conv_transpose_op.h
--- a/lite/operators/crf_decoding_op.cc
+++ b/lite/operators/crf_decoding_op.cc
--- a/lite/kernels/opencl/concat_compute.h
+++ b/lite/kernels/opencl/concat_compute.h
--- a/lite/operators/crop_op.cc
+++ b/lite/operators/crop_op.cc
--- a/lite/operators/crop_op.h
+++ b/lite/operators/crop_op.h
--- a/lite/operators/ctc_align_op.cc
+++ b/lite/operators/ctc_align_op.cc
--- a/lite/operators/ctc_align_op.h
+++ b/lite/operators/ctc_align_op.h
--- a/lite/operators/decode_bboxes_op.cc
+++ b/lite/operators/decode_bboxes_op.cc
--- a/lite/operators/decode_bboxes_op.h
+++ b/lite/operators/decode_bboxes_op.h
--- a/lite/operators/density_prior_box_op.cc
+++ b/lite/operators/density_prior_box_op.cc
--- a/lite/operators/density_prior_box_op.h
+++ b/lite/operators/density_prior_box_op.h
--- a/lite/operators/distribute_fpn_proposals_op.cc
+++ b/lite/operators/distribute_fpn_proposals_op.cc
--- a/lite/operators/distribute_fpn_proposals_op.h
+++ b/lite/operators/distribute_fpn_proposals_op.h
--- a/lite/operators/dropout_op.cc
+++ b/lite/operators/dropout_op.cc
--- a/lite/operators/dropout_op.h
+++ b/lite/operators/dropout_op.h
--- a/lite/operators/elementwise_grad_ops.cc
+++ b/lite/operators/elementwise_grad_ops.cc
--- a/lite/operators/elementwise_grad_ops.h
+++ b/lite/operators/elementwise_grad_ops.h
--- a/lite/operators/elementwise_ops.cc
+++ b/lite/operators/elementwise_ops.cc
--- a/lite/operators/elementwise_ops.h
+++ b/lite/operators/elementwise_ops.h
--- a/lite/operators/expand_op.cc
+++ b/lite/operators/expand_op.cc
--- a/lite/operators/expand_op.h
+++ b/lite/operators/expand_op.h
--- a/lite/operators/fake_channel_wise_dequantize_max_abs.h
+++ b/lite/operators/fake_channel_wise_dequantize_max_abs.h
--- a/lite/operators/fake_dequantize_max_abs.h
+++ b/lite/operators/fake_dequantize_max_abs.h
--- a/lite/operators/fake_quantize_dequantize_moving_avg_max_abs.h
+++ b/lite/operators/fake_quantize_dequantize_moving_avg_max_abs.h
--- a/lite/operators/fake_quantize_moving_avg_max_abs.h
+++ b/lite/operators/fake_quantize_moving_avg_max_abs.h
--- a/lite/operators/fake_quantize_range_abs_max.h
+++ b/lite/operators/fake_quantize_range_abs_max.h
--- a/lite/operators/fc_op.cc
+++ b/lite/operators/fc_op.cc
--- a/lite/operators/fc_op.h
+++ b/lite/operators/fc_op.h
--- a/lite/operators/feed_op.cc
+++ b/lite/operators/feed_op.cc
--- a/lite/operators/fetch_op.cc
+++ b/lite/operators/fetch_op.cc
--- a/lite/operators/fill_constant_batch_size_like_op.cc
+++ b/lite/operators/fill_constant_batch_size_like_op.cc
--- a/lite/operators/fill_constant_batch_size_like_op.h
+++ b/lite/operators/fill_constant_batch_size_like_op.h
--- a/lite/operators/fill_constant_op.cc
+++ b/lite/operators/fill_constant_op.cc
--- a/lite/operators/fill_constant_op.h
+++ b/lite/operators/fill_constant_op.h
--- a/lite/operators/flatten_op.cc
+++ b/lite/operators/flatten_op.cc
--- a/lite/operators/flatten_op.h
+++ b/lite/operators/flatten_op.h
--- a/lite/operators/fusion_elementwise_activation_ops.cc
+++ b/lite/operators/fusion_elementwise_activation_ops.cc
--- a/lite/operators/fusion_elementwise_activation_ops.h
+++ b/lite/operators/fusion_elementwise_activation_ops.h