Merge branch 'develop' into int8

f16090b5 · HappyAngel · GitHub · fe5099f4 · 3c1d9817 · f16090b5
377 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,6 +221,7 @@ endif()
 if(LITE_WITH_MLU)
    include(mlu)
 endif()
+include(coveralls)

 include(external/mklml)     # download mklml package
 include(external/xbyak)     # download xbyak package

--- a/cmake/coveralls.cmake
+++ b/cmake/coveralls.cmake
@@ -20,6 +20,9 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
    # will be converted from the format "1;2;3" to "1 2 3".
    set(COVERAGE_SRCS "")
    foreach (SINGLE_SRC ${_COVERAGE_SRCS})
+        if ("${SINGLE_SRC}" MATCHES "/Paddle-Lite/third-party/*")
+            continue()
+        endif()
        set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
    endforeach()

@@ -62,7 +65,7 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
 endfunction()

 if(WITH_COVERAGE)
-    set(CMAKE_BUILD_TYPE "Debug")
+    #set(CMAKE_BUILD_TYPE "Debug")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")

@@ -95,9 +98,11 @@ if(WITH_COVERAGE)
        set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
    endforeach()

+    set(COVERALLS_UPLOAD ON)
    code_coverage(
        "${PADDLE_SRCS}"
        ${COVERALLS_UPLOAD}
        "${PROJECT_SOURCE_DIR}/cmake"
    )
 endif()
+
--- a/docs/api_reference/python_api/CxxConfig.md
+++ b/docs/api_reference/python_api/CxxConfig.md
+## CxxConfig
+
+```python
+class CxxConfig;
+```
+
+`CxxConfig`用来配置构建CxxPredictor的配置信息，如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+config = CxxConfig()
+# 设置模型目录，加载非combined模型时使用
+config.set_model_dir(<your_model_dir_path>)
+# 设置工作线程数(该接口只支持armlinux)
+# config.set_threads(4);
+# 设置能耗模式(该接口只支持armlinux)
+# config.set_power_mode(PowerMode.LITE_POWER_NO_BIND)
+# 设置valid places
+places = [Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 根据CxxConfig创建CxxPredictor
+predictor = create_paddle_predictor(config)
+```
+
+### `set_model_dir(model_dir)`
+
+设置模型文件夹路径，当需要从磁盘加载非combined模型时使用。
+
+参数：
+
+- `model_dir(str)` - 模型文件夹路径
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `model_dir()`
+
+返回设置的模型文件夹路径。
+
+参数：
+
+- `None`
+
+返回：模型文件夹路径
+
+返回类型：`str`
+
+
+
+### `set_model_file(model_file)`
+
+设置模型文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `model_file(str)` - 模型文件路径
+
+返回类型：`None`
+
+
+
+### `model_file()`
+
+获取设置模型文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `None`
+
+返回：模型文件路径
+
+返回类型：`str`
+
+
+
+### `set_param_file(param_file)`
+
+设置模型参数文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `param_file(str)` - 模型参数文件路径
+
+返回类型：`None`
+
+
+
+### `param_file()`
+
+获取设置模型参数文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `None`
+
+返回：模型参数文件路径
+
+返回类型：`str`
+
+
+
+### `set_valid_places(valid_places)`
+
+设置可用的places列表。
+
+参数：
+
+- `valid_places(list)` - 可用place列表。
+
+返回类型：`None`
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+config = CxxConfig()
+# 设置模型目录，加载非combined模型时使用
+config.set_model_dir(<your_model_dir_path>)
+# 设置valid places
+# 注意，valid_places列表中Place的排序表明了用户对Place的偏好程度，如用户想优先使用ARM上Int8精度的
+# kernel，则应把Place(TargetType.ARM, PrecisionType.INT8)置于valid_places列表的首位。
+places = [Place(TargetType.ARM, PrecisionType.INT8),
+          Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 根据CxxConfig创建CxxPredictor
+predictor = create_paddle_predictor(config)
+```
+
+
+
+### `set_power_mode(mode)`
+
+设置CPU能耗模式，该接口只支持`armlinux`平台。若不设置，则默认使用`PowerMode.LITE_POWER_HIGH`。
+
+*注意：只在开启`OpenMP`时生效，否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `mode(PowerMode)` - CPU能耗模式
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `power_mode()`
+
+获取设置的CPU能耗模式，该接口只支持`armlinux`平台。
+
+*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `None`
+
+返回：设置的CPU能耗模式
+
+返回类型：`PowerMode`
+
+
+
+### `set_threads(threads)`
+
+设置工作线程数，该接口只支持`armlinux`平台。若不设置，则默认使用单线程。
+
+*注意：只在开启`OpenMP`的模式下生效，否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `threads(int)` - 工作线程数
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `threads()`
+
+获取设置的工作线程数，该接口只支持`armlinux`平台。
+
+*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `None`
+
+返回：工作线程数
+
+返回类型：`int`
--- a/docs/api_reference/python_api/CxxPredictor.md
+++ b/docs/api_reference/python_api/CxxPredictor.md
+## CxxPredictor
+
+```c++
+class CxxPredictor
+```
+
+`CxxPredictor`是Paddle-Lite的预测器，由`create_paddle_predictor`根据`CxxConfig`进行创建。用户可以根据CxxPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。
+
+示例：
+
+```python
+from paddlelite.lite import *
+from lite_core import *
+
+# 1. 设置CxxConfig
+config = CxxConfig()
+if args.model_file != '' and args.param_file != '':
+    config.set_model_file(args.model_file)
+    config.set_param_file(args.param_file)
+else:
+    config.set_model_dir(args.model_dir)
+places = [Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 2. 创建CxxPredictor
+predictor = create_paddle_predictor(config)
+
+# 3. 设置输入数据
+input_tensor = predictor.get_input(0)
+input_tensor.resize([1, 3, 224, 224])
+input_tensor.set_float_data([1.] * 3 * 224 * 224)
+
+# 4. 运行模型
+predictor.run()
+
+# 5. 获取输出数据
+output_tensor = predictor.get_output(0)
+print(output_tensor.shape())
+print(output_tensor.float_data()[:10])
+```
+
+### `get_input(index)`
+
+获取输入Tensor，用来设置模型的输入数据。
+
+参数：
+
+- `index(int)` - 输入Tensor的索引
+
+返回：第`index`个输入`Tensor`
+
+返回类型：`Tensor`
+
+
+
+### `get_output(index)`
+
+获取输出Tensor，用来获取模型的输出结果。
+
+参数：
+
+- `index(int)` - 输出Tensor的索引
+
+返回：第`index`个输出`Tensor`
+
+返回类型：`Tensor`
+
+
+
+### `run()`
+
+执行模型预测，需要在***设置输入数据后***调用。
+
+参数：
+
+- `None`
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `get_version()`
+
+用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息，如`v2.0-beta`；否则返回代码的`branch(commitid)`，如`develop(7e44619)`。
+
+参数：
+
+- `None`
+
+返回：当前lib使用的代码版本信息
+
+返回类型：`str`
--- a/docs/api_reference/python_api/LightPredictor.md
+++ b/docs/api_reference/python_api/LightPredictor.md
+## LightPredictor
+
+```c++
+class LightPredictor
+```
+
+`LightPredictor`是Paddle-Lite的预测器，由`create_paddle_predictor`根据`MobileConfig`进行创建。用户可以根据LightPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。
+
+示例：
+
+```python
+from __future__ import print_function
+from paddlelite.lite import *
+
+# 1. 设置MobileConfig
+config = MobileConfig()
+config.set_model_dir(args.model_dir)
+
+# 2. 创建LightPredictor
+predictor = create_paddle_predictor(config)
+
+# 3. 设置输入数据
+input_tensor = predictor.get_input(0)
+input_tensor.resize([1, 3, 224, 224])
+input_tensor.set_float_data([1.] * 3 * 224 * 224)
+
+# 4. 运行模型
+predictor.run()
+
+# 5. 获取输出数据
+output_tensor = predictor.get_output(0)
+print(output_tensor.shape())
+print(output_tensor.float_data()[:10])
+```
+
+### `get_input(index)`
+
+获取输入Tensor，用来设置模型的输入数据。
+
+参数：
+
+- `index(int)` - 输入Tensor的索引
+
+返回：第`index`个输入`Tensor`
+
+返回类型：`Tensor`
+
+
+
+### `get_output(index)`
+
+获取输出Tensor，用来获取模型的输出结果。
+
+参数：
+
+- `index(int)` - 输出Tensor的索引
+
+返回：第`index`个输出`Tensor`
+
+返回类型：`Tensor`
+
+
+
+### `run()`
+
+执行模型预测，需要在***设置输入数据后***调用。
+
+参数：
+
+- `None`
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `get_version()`
+
+用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息，如`v2.0-beta`；否则返回代码的`branch(commitid)`，如`develop(7e44619)`。
+
+参数：
+
+- `None`
+
+返回：当前lib使用的代码版本信息
+
+返回类型：`str`
--- a/docs/api_reference/python_api/MobileConfig.md
+++ b/docs/api_reference/python_api/MobileConfig.md
+## MobileConfig
+
+```python
+class MobileConfig;
+```
+
+`MobileConfig`用来配置构建LightPredictor的配置信息，如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+config = MobileConfig()
+# 设置NaiveBuffer格式模型目录
+config.set_model_from_file(<your_model_path>)
+# 设置工作线程数
+config.set_threads(4)
+# 设置能耗模式
+config.set_power_mode(PowerMode.LITE_POWER_NO_BIND)
+
+# 根据MobileConfig创建LightPredictor
+predictor = create_paddle_predictor(config)
+```
+
+### `set_model_from_file(model_file)`
+
+**注意**：`model_file`应该是经过`opt`优化后产生的`NaiveBuffer`格式的模型。
+
+设置模型文件路径。
+
+参数：
+
+- `model_file(str)` - 模型文件路径
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `set_model_dir(model_dir)`
+
+**注意**：Lite模型格式在release/v2.3.0之后修改，本接口为加载老格式模型的接口，将在release/v3.0.0废弃。建议替换为`set_model_from_file`接口。`model_dir`应该是经过`Model Optimize Tool`优化后产生的`NaiveBuffer`格式的模型。
+
+设置模型文件夹路径。
+
+参数：
+
+- `model_dir(str)` - 模型文件夹路径
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `set_model_from_buffer(model_buffer)`
+
+设置模型的内存数据，当需要从内存加载模型时使用。
+
+参数：
+
+- `model_buffer(str)` - 内存中的模型数据
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+
+### `model_dir()`
+
+返回设置的模型文件夹路径。
+
+参数：
+
+- `None`
+
+返回：模型文件夹路径
+
+返回类型：`str`
+
+
+
+### `set_power_mode(mode)`
+
+设置CPU能耗模式。若不设置，则默认使用`PowerMode.LITE_POWER_HIGH`。
+
+*注意：只在开启`OpenMP`时生效，否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `mode(PowerMode)` - CPU能耗模式
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `power_mode()`
+
+获取设置的CPU能耗模式，该接口只支持`armlinux`平台。
+
+*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `None`
+
+返回：设置的CPU能耗模式
+
+返回类型：`PowerMode`
+
+
+
+### `set_threads(threads)`
+
+设置工作线程数，该接口只支持`armlinux`平台。若不设置，则默认使用单线程。
+
+*注意：只在开启`OpenMP`的模式下生效，否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `threads(int)` - 工作线程数
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `threads()`
+
+获取设置的工作线程数，该接口只支持`armlinux`平台。
+
+*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数：
+
+- `None`
+
+返回：工作线程数
+
+返回类型：`int`
--- a/docs/api_reference/python_api/PowerMode.md
+++ b/docs/api_reference/python_api/PowerMode.md
+## PowerMode
+
+```python
+class PowerMode;
+```
+
+`PowerMode`为ARM CPU能耗模式，用户可以根据应用场景设置能耗模式获得最优的能效比。
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+config = MobileConfig()
+# 设置NaiveBuffer格式模型目录
+config.set_model_dir(<your_model_dir_path>)
+# 设置能耗模式
+config.set_power_mode(PowerMode.LITE_POWER_NO_BIND)
+
+# 根据MobileConfig创建LightPredictor
+predictor = create_paddle_predictor(config)
+```
+
+PowerMode详细说明如下：
+
+|         选项         | 说明                                                         |
+| :------------------: | ------------------------------------------------------------ |
+|   LITE_POWER_HIGH    | 绑定大核运行模式。如果ARM CPU支持big.LITTLE，则优先使用并绑定Big cluster。如果设置的线程数大于大核数量，则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败，如果失败则进入不绑核模式。 |
+|    LITE_POWER_LOW    | 绑定小核运行模式。如果ARM CPU支持big.LITTLE，则优先使用并绑定Little cluster。如果设置的线程数大于小核数量，则会将线程数自动缩放到小核数量。如果找不到小核，则自动进入不绑核模式。 |
+|   LITE_POWER_FULL    | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时，则会自动将线程数缩放到核心数量。 |
+|  LITE_POWER_NO_BIND  | 不绑核运行模式（推荐）。系统根据负载自动调度任务到空闲的CPU核心上。 |
+| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心，则每预测10次后切换绑定到下一个核心。 |
+| LITE_POWER_RAND_LOW  | 轮流绑定小核模式。如果Little cluster有多个核心，则每预测10次后切换绑定到下一个核心。 |
--- a/docs/api_reference/python_api/Tensor.md
+++ b/docs/api_reference/python_api/Tensor.md
+## Tensor
+
+```c++
+class Tensor
+```
+
+Tensor是Paddle-Lite的数据组织形式，用于对底层数据进行封装并提供接口对数据进行操作，包括设置Shape、数据、LoD信息等。
+
+*注意：用户应使用`CxxPredictor`或`LightPredictor`的`get_input`和`get_output`接口获取输入/输出的`Tensor`。*
+
+示例：
+
+```python
+from paddlelite.lite import *
+from lite_core import *
+
+# 1. 设置CxxConfig
+config = CxxConfig()
+if args.model_file != '' and args.param_file != '':
+    config.set_model_file(args.model_file)
+    config.set_param_file(args.param_file)
+else:
+    config.set_model_dir(args.model_dir)
+places = [Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 2. 创建CxxPredictor
+predictor = create_paddle_predictor(config)
+
+# 3. 设置输入数据
+input_tensor = predictor.get_input(0)
+input_tensor.resize([1, 3, 224, 224])
+input_tensor.set_float_data([1.] * 3 * 224 * 224)
+
+# 4. 运行模型
+predictor.run()
+
+# 5. 获取输出数据
+output_tensor = predictor.get_output(0)
+print(output_tensor.shape())
+print(output_tensor.float_data()[:10])
+```
+
+### `resize(shape)`
+
+设置Tensor的维度信息。
+
+参数：
+
+- `shape(list)` - 维度信息
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `shape()`
+
+获取Tensor的维度信息。
+
+参数：
+
+- `None`
+
+返回：Tensor的维度信息
+
+返回类型：`list`
+
+
+
+### `float_data()`
+
+获取Tensor持有的float型数据。
+
+示例：
+
+```python
+output_tensor = predictor.get_output(0)
+print(output_tensor.shape())
+print(output_tensor.float_data()[:10])
+```
+
+参数：
+
+- `None`
+
+返回：`Tensor`持有的float型数据
+
+返回类型：`list`
+
+
+
+### `set_float_data(float_data)`
+
+设置Tensor持有的float型数据。
+
+示例：
+
+```python
+input_tensor = predictor.get_input(0)
+input_tensor.resize([1, 3, 224, 224])
+input_tensor.set_float_data([1.] * 3 * 224 * 224)
+```
+
+参数：
+
+- `float_data(list)` - 待设置的float型数据
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `set_lod(lod)`
+
+设置Tensor的LoD信息。
+
+参数：
+
+- `lod(list[list])` - Tensor的LoD信息
+
+返回：`None`
+
+返回类型：`None`
+
+
+
+### `lod()`
+
+获取Tensor的LoD信息
+
+参数：
+
+- `None`
+
+返回：`Tensor`的LoD信息
+
+返回类型：`list[list]`
--- a/docs/api_reference/python_api/TypePlace.md
+++ b/docs/api_reference/python_api/TypePlace.md
+## TargetType
+
+```python
+class TargetType;
+```
+`TargetType`为目标设备硬件类型，用户可以根据应用场景选择硬件平台类型。
+
+枚举型变量`TargetType`的所有可能取值包括：
+
+`{X86, CUDA, ARM, OpenCL, FPGA, NPU}`
+
+
+## PrecisionType
+```python
+class PrecisionType {FP32};
+```
+`PrecisionType`为模型中Tensor的数据精度，默认值为FP32(float32)。
+
+枚举型变量`PrecisionType`的所有可能取值包括：
+
+`{FP32, INT8, INT32, INT64}`
+
+
+
+
+## DataLayoutType
+
+```python
+class DataLayoutType {NCHW};
+```
+`DataLayoutType`为Tensor的数据格式，默认值为NCHW（number, channel, height, width）。
+
+枚举型变量`DataLayoutType`的所有可能取值包括：
+
+` {NCHW, NHWC}`
+
+
+
+## Place
+```python
+class Place{
+  TargetType target;
+  PrecisionType precision{FP32};
+  DataLayoutType layout{NCHW}
+}
+```
+`Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合，说明运行时的设备类型、数据精度和数据格式。
+
+示例：
+```python
+from lite_core import *
+
+Place(TargetType.ARM, PrecisionType.FP32, DataLayoutType.NCHW)
+```
--- a/docs/api_reference/python_api/create_paddle_predictor.md
+++ b/docs/api_reference/python_api/create_paddle_predictor.md
+
+## create_paddle_predictor
+
+```python
+CxxPredictor create_paddle_predictor(config); # config为CxxConfig类型
+LightPredictor create_paddle_predictor(config); # config为MobileConfig类型
+```
+
+`create_paddle_predictor`函数用来根据`CxxConfig`或`MobileConfig`构建预测器。
+
+示例：
+
+```python
+from paddlelite.lite import *
+
+# 设置CxxConfig
+config = CxxConfig()
+config.set_model_dir(<your_model_dir_path>)
+places = [Place(TargetType.ARM, PrecisionType.FP32)]
+config.set_valid_places(places)
+
+# 根据CxxConfig创建CxxPredictor
+predictor = create_paddle_predictor(config)
+```
+
+参数：
+
+- `config(CxxConfig或MobileConfig)` - 用于构建Predictor的配置信息。
+
+返回：预测器`predictor`
+
+返回类型：`CxxPredictor`或`LightPredictor`
--- a/docs/api_reference/python_api/opt.md
+++ b/docs/api_reference/python_api/opt.md
+## Opt
+
+```python
+class Opt;
+```
+
+`Opt`模型离线优化接口，Paddle原生模型需经`opt`优化图结构后才能在Paddle-Lite上运行。
+
+示例：  
+
+假设待转化模型为当前文件夹下的`mobilenet_v1`，可以使用以下脚本转换
+
+```python
+# 引用Paddlelite预测库
+from paddlelite.lite import *
+
+# 1. 创建opt实例
+opt=Opt()
+# 2. 指定输入模型地址 
+opt.set_model_dir("./mobilenet_v1")
+# 3. 指定转化类型： arm、x86、opencl、xpu、npu
+opt.set_valid_places("arm")
+# 4. 指定模型转化类型： naive_buffer、protobuf
+opt.set_model_type("naive_buffer")
+# 5. 输出模型地址
+opt.set_optimize_out("mobilenetv1_opt")
+# 6. 执行模型优化
+opt.run()
+```
+
+### `set_model_dir(model_dir)`
+
+设置模型文件夹路径，当需要从磁盘加载非combined模型时使用。
+
+参数：
+
+- `model_dir(str)` - 模型文件夹路径
+
+返回：`None`
+
+
+
+### `set_model_file(model_file)`
+
+设置模型文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `model_file(str)` - 模型文件路径
+
+
+
+### `set_param_file(param_file)`
+
+设置模型参数文件路径，加载combined形式模型时使用。
+
+参数：
+
+- `param_file(str)` - 模型参数文件路径
+
+
+### `set_model_type(type)`
+
+设置模型的输出类型，当前支持`naive_buffer`和`protobuf`两种格式，移动端预测需要转化为`naive_buffer`
+
+参数：
+
+- `type(str)` - 模型格式（`naive_buffer/protobuf`)
+
+
+
+### `set_valid_places(valid_places)`
+
+设置可用的places列表。
+
+参数：
+
+- `valid_places(str)` - 可用place列表，不同place用`,`隔开
+
+示例：
+
+```python
+# 引用Paddlelite预测库
+from paddlelite.lite import *
+
+# 1. 创建opt实例
+opt=Opt()
+# 2. 指定转化类型： arm、x86、opencl、xpu、npu
+opt.set_valid_places("arm, opencl")
+```
+
+
+
+
+### `set_optimize_out(optimized_model_name)`
+
+设置优化后模型的名称，优化后模型文件以`.nb`作为文件后缀。
+
+参数：
+
+- `optimized_model_name(str)`
+
+### `run()`
+
+执行模型优化，用以上接口设置完 `模型路径`、`model_type`、`optimize_out`和`valid_places`后，执行`run()`接口会根据以上设置转化模型，转化后模型保存在当前路径下。
+
+
+### `run_optimize(model_dir, model_file, param_file, type, valid_places, optimized_model_name)`
+
+执行模型优化，无需设置以上接口，直接指定 `模型路径`、`model_type`、`optimize_out`和`valid_places`并执行模型转化。
+
+参数：
+
+- `model_dir(str)` - 模型文件夹路径
+- `model_file(str)` - 模型文件路径
+- `param_file(str)` - 模型参数文件路径
+- `type(str)` - 模型格式（`naive_buffer/protobuf`)
+- `valid_places(str)` - 可用place列表，不同place用`,`隔开
+- `optimized_model_name(str)`
+
+```python
+# 引用Paddlelite预测库
+from paddlelite.lite import *
+# 1. 创建opt实例
+opt=Opt()
+# 2. 执行模型优化
+opt.run_optimize("./mobilenet_v1","","","arm","mobilenetv1_opt");
+```
--- a/docs/api_reference/python_api_doc.md
+++ b/docs/api_reference/python_api_doc.md
 # Python API

-## create_paddle_predictor

-```python
-CxxPredictor create_paddle_predictor(config); # config为CxxConfig类型
-LightPredictor create_paddle_predictor(config); # config为MobileConfig类型
-```
+### [create_paddle_predictor](./python_api/create_paddle_predictor)

-`create_paddle_predictor`函数用来根据`CxxConfig`或`MobileConfig`构建预测器。
+创建预测执行器[`CxxPredictor`](./python_api/CxxPredictor)或者[`LightPredictor`](./python_api/LightPredictor)

-示例：
+### [Opt](./python_api/opt)

 ```python
-from lite_core import *
-
-# 设置CxxConfig
-config = CxxConfig()
-config.set_model_dir(<your_model_dir_path>)
-places = [Place(TargetType.ARM, PrecisionType.FP32)]
-config.set_valid_places(places)
-
-# 根据CxxConfig创建CxxPredictor
-predictor = create_paddle_predictor(config)
+class Opt;
 ```

-参数：
-
- `config(CxxConfig或MobileConfig)` - 用于构建Predictor的配置信息。
-
-返回：预测器`predictor`
-
-返回类型：`CxxPredictor`或`LightPredictor`
-
-## CxxConfig
+`Opt`模型离线优化接口，Paddle原生模型需经`opt`优化图结构后才能在Paddle-Lite上运行。

+### [CxxConfig](./python_api/CxxConfig)
 ```python
 class CxxConfig;
 ```

 `CxxConfig`用来配置构建CxxPredictor的配置信息，如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。

-示例：
-
-```python
-from lite_core import *
-
-config = CxxConfig()
-# 设置模型目录，加载非combined模型时使用
-config.set_model_dir(<your_model_dir_path>)
-# 设置工作线程数
-config.set_threads(4);
-# 设置能耗模式
-config.set_power_mode(PowerMode.LITE_POWER_NO_BIND)
-# 设置valid places
-places = [Place(TargetType.ARM, PrecisionType.FP32)]
-config.set_valid_places(places)
-
-# 根据CxxConfig创建CxxPredictor
-predictor = create_paddle_predictor(config)
-```
-
-### `set_model_dir(model_dir)`
-
-设置模型文件夹路径，当需要从磁盘加载非combined模型时使用。
-
-参数：
-
- `model_dir(str)` - 模型文件夹路径
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `model_dir()`
-
-返回设置的模型文件夹路径。
-
-参数：
-
- `None`
-
-返回：模型文件夹路径
-
-返回类型：`str`
-
-
-
-### `set_model_file(model_file)`
-
-设置模型文件路径，加载combined形式模型时使用。
-
-参数：
-
- `model_file(str)` - 模型文件路径
-
-返回类型：`None`
-
-
-
-### `model_file()`
-
-获取设置模型文件路径，加载combined形式模型时使用。
-
-参数：
-
- `None`
-
-返回：模型文件路径
-
-返回类型：`str`
-
-
-
-### `set_param_file(param_file)`

-设置模型参数文件路径，加载combined形式模型时使用。
-
-参数：
-
- `param_file(str)` - 模型文件路径
-
-返回类型：`None`
-
-
-
-### `param_file()`
-
-获取设置模型参数文件路径，加载combined形式模型时使用。
-
-参数：
-
- `None`
-
-返回：模型参数文件路径
-
-返回类型：`str`
-
-
-
-### `set_valid_places(valid_places)`
-
-设置可用的places列表。
-
-参数：
-
- `valid_places(list)` - 可用place列表。
-
-返回类型：`None`
-
-示例：
-
-```python
-from lite_core import *
-
-config = CxxConfig()
-# 设置模型目录，加载非combined模型时使用
-config.set_model_dir(<your_model_dir_path>)
-# 设置valid places
-# 注意，valid_places列表中Place的排序表明了用户对Place的偏好程度，如用户想优先使用ARM上Int8精度的
-# kernel，则应把Place(TargetType.ARM, PrecisionType.INT8)置于valid_places列表的首位。
-places = [Place(TargetType.ARM, PrecisionType.INT8),
-          Place(TargetType.ARM, PrecisionType.FP32)]
-config.set_valid_places(places)
-
-# 根据CxxConfig创建CxxPredictor
-predictor = create_paddle_predictor(config)
-```
-
-
-
-### `set_power_mode(mode)`
-
-设置CPU能耗模式。若不设置，则默认使用`PowerMode.LITE_POWER_HIGH`。
-
-*注意：只在开启`OpenMP`时生效，否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
-
-参数：
-
- `mode(PowerMode)` - CPU能耗模式
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `power_mode()`
-
-获取设置的CPU能耗模式。
-
-*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
-
-参数：
-
- `None`
-
-返回：设置的CPU能耗模式
-
-返回类型：`PowerMode`
-
-
-
-### `set_threads(threads)`
-
-设置工作线程数。若不设置，则默认使用单线程。
-
-*注意：只在开启`OpenMP`的模式下生效，否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
-
-参数：
-
- `threads(int)` - 工作线程数
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `threads()`
-
-获取设置的工作线程数。
-
-*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
-
-参数：
-
- `None`
-
-返回：工作线程数
-
-返回类型：`int`
-
-## MobileConfig
+### [MobileConfig](./python_api/MobileConfig)

 ```python
 class MobileConfig;
@@ -241,388 +29,31 @@ class MobileConfig;

 `MobileConfig`用来配置构建LightPredictor的配置信息，如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。

-示例：
-
-```python
-from lite_core import *
-
-config = MobileConfig()
-# 设置NaiveBuffer格式模型目录
-config.set_model_from_file(<your_model_path>)
-# 设置工作线程数
-config.set_threads(4);
-# 设置能耗模式
-config.set_power_mode(PowerMode.LITE_POWER_NO_BIND)
-
-# 根据MobileConfig创建LightPredictor
-predictor = create_paddle_predictor(config)
-```
-
-### `set_model_from_file(model_file)`
-
-**注意**：`model_file`应该是经过`opt`优化后产生的`NaiveBuffer`格式的模型。
-
-设置模型文件夹路径。
-
-参数：
-
- `model_file(str)` - 模型文件路径
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `set_model_dir(model_dir)`
-
-**注意**：Lite模型格式在release/v2.3.0之后修改，本接口为加载老格式模型的接口，将在release/v3.0.0废弃。建议替换为`setModelFromFile`接口。`model_dir`应该是经过`Model Optimize Tool`优化后产生的`NaiveBuffer`格式的模型。
-
-设置模型文件夹路径。
-
-参数：
-
- `model_dir(str)` - 模型文件夹路径
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `set_model_from_buffer(model_buffer)`
-
-设置模型的内存数据，当需要从内存加载模型时使用。
-
-参数：
-
- `model_buffer(str)` - 内存中的模型数据
-
-返回：`None`
-
-返回类型：`void`
-
-
-
-
-### `model_dir()`
-
-返回设置的模型文件夹路径。
-
-参数：
-
- `None`
-
-返回：模型文件夹路径
-
-返回类型：`str`
-
-
-
-### `set_power_mode(mode)`
-
-设置CPU能耗模式。若不设置，则默认使用`PowerMode.LITE_POWER_HIGH`。
-
-*注意：只在开启`OpenMP`时生效，否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
-
-参数：
-
- `mode(PowerMode)` - CPU能耗模式
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `power_mode()`
-
-获取设置的CPU能耗模式。
-
-*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
-
-参数：
-
- `None`
-
-返回：设置的CPU能耗模式
-
-返回类型：`PowerMode`
-
-
-
-### `set_threads(threads)`
-
-设置工作线程数。若不设置，则默认使用单线程。
-
-*注意：只在开启`OpenMP`的模式下生效，否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
-
-参数：
-
- `threads(int)` - 工作线程数
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `threads()`
-
-获取设置的工作线程数。
-
-*注意：此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
-
-参数：

- `None`
+### [CxxPredictor](./python_api/CxxPredictor)

-返回：工作线程数
-
-返回类型：`int`
-
-## CxxPredictor
-
-```c++
+```python
 class CxxPredictor
 ```

 `CxxPredictor`是Paddle-Lite的预测器，由`create_paddle_predictor`根据`CxxConfig`进行创建。用户可以根据CxxPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。

-示例：
-
-```python
-from __future__ import print_function
-from lite_core import *
-
-# 1. 设置CxxConfig
-config = CxxConfig()
-if args.model_file != '' and args.param_file != '':
-    config.set_model_file(args.model_file)
-    config.set_param_file(args.param_file)
-else:
-    config.set_model_dir(args.model_dir)
-places = [Place(TargetType.ARM, PrecisionType.FP32)]
-config.set_valid_places(places)
-
-# 2. 创建CxxPredictor
-predictor = create_paddle_predictor(config)
-
-# 3. 设置输入数据
-input_tensor = predictor.get_input(0)
-input_tensor.resize([1, 3, 224, 224])
-input_tensor.set_float_data([1.] * 3 * 224 * 224)
-
-# 4. 运行模型
-predictor.run()
-
-# 5. 获取输出数据
-output_tensor = predictor.get_output(0)
-print(output_tensor.shape())
-print(output_tensor.float_data()[:10])
-```
-
-### `get_input(index)`
-
-获取输入Tensor，用来设置模型的输入数据。
-
-参数：
-
- `index(int)` - 输入Tensor的索引
-
-返回：第`index`个输入`Tensor`
-
-返回类型：`Tensor`
-
-
-
-### `get_output(index)`
-
-获取输出Tensor，用来获取模型的输出结果。
-
-参数：
-
- `index(int)` - 输出Tensor的索引
-
-返回：第`index`个输出`Tensor`
-
-返回类型：`Tensor`
-
-
-
-### `run()`
-
-执行模型预测，需要在***设置输入数据后***调用。
-
-参数：
-
- `None`
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `get_version()`
-
-用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息，如`v2.0-beta`；否则返回代码的`branch(commitid)`，如`develop(7e44619)`。
-
-参数：
-
- `None`
-
-返回：当前lib使用的代码版本信息
-
-返回类型：`str`
-
-## LightPredictor
-
-```c++
-class LightPredictor
-```
-
-`LightPredictor`是Paddle-Lite的预测器，由`create_paddle_predictor`根据`MobileConfig`进行创建。用户可以根据LightPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。

-示例：
-
-```python
-from __future__ import print_function
-from lite_core import *
-
-# 1. 设置MobileConfig
-config = MobileConfig()
-config.set_model_dir(args.model_dir)
-
-# 2. 创建LightPredictor
-predictor = create_paddle_predictor(config)
-
-# 3. 设置输入数据
-input_tensor = predictor.get_input(0)
-input_tensor.resize([1, 3, 224, 224])
-input_tensor.set_float_data([1.] * 3 * 224 * 224)
-
-# 4. 运行模型
-predictor.run()
-
-# 5. 获取输出数据
-output_tensor = predictor.get_output(0)
-print(output_tensor.shape())
-print(output_tensor.float_data()[:10])
-```
-
-### `get_input(index)`
-
-获取输入Tensor，用来设置模型的输入数据。
-
-参数：
-
- `index(int)` - 输入Tensor的索引
-
-返回：第`index`个输入`Tensor`
-
-返回类型：`Tensor`
-
-
-
-### `get_output(index)`
-
-获取输出Tensor，用来获取模型的输出结果。
-
-参数：
-
- `index(int)` - 输出Tensor的索引
-
-返回：第`index`个输出`Tensor`
-
-返回类型：`Tensor`

+### [TargetType、PrecisionType、DataLayoutType、Place](./python_api/TypePlace)

-
-### `run()`
-
-执行模型预测，需要在***设置输入数据后***调用。
-
-参数：
-
- `None`
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `get_version()`
-
-用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息，如`v2.0-beta`；否则返回代码的`branch(commitid)`，如`develop(7e44619)`。
-
-参数：
-
- `None`
-
-返回：当前lib使用的代码版本信息
-
-返回类型：`str`
-
-## TargetType
-
-```python
-class TargetType;
-```
 `TargetType`为目标设备硬件类型，用户可以根据应用场景选择硬件平台类型。

-枚举型变量`TargetType`的所有可能取值包括：
-
-`{X86, CUDA, ARM, OpenCL, FPGA, NPU}`
-
-
-## PrecisionType
-```python
-class PrecisionType {FP32};
-```
 `PrecisionType`为模型中Tensor的数据精度，默认值为FP32(float32)。

-枚举型变量`PrecisionType`的所有可能取值包括：
-
-`{FP32, INT8, INT32, INT64}`
-
-
-
-
-## DataLayoutType
-
-```python
-class DataLayoutType {NCHW};
-```
 `DataLayoutType`为Tensor的数据格式，默认值为NCHW（number, channel, height, width）。

-枚举型变量`DataLayoutType`的所有可能取值包括：
-
-` {NCHW, NHWC}`
-
-
-
-## Place
-```python
-class Place{
-  TargetType target;
-  PrecisionType precision{FP32};
-  DataLayoutType layout{NCHW}
-}
-```
 `Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合，说明运行时的设备类型、数据精度和数据格式。

-示例：
-```python
-from lite_core import *
-
-Place{TargetType(ARM), PrecisionType(FP32), DataLayoutType(NCHW)}
-```



-## PowerMode
+### [PowerMode](./python_api/PowerMode)

 ```python
 class PowerMode;
@@ -630,35 +61,9 @@ class PowerMode;

 `PowerMode`为ARM CPU能耗模式，用户可以根据应用场景设置能耗模式获得最优的能效比。

-示例：

-```python
-from lite_core import *

-config = MobileConfig()
-# 设置NaiveBuffer格式模型目录
-config.set_model_dir(<your_model_dir_path>)
-# 设置能耗模式
-config.set_power_mode(PowerMode.LITE_POWER_NO_BIND)
-
-# 根据MobileConfig创建LightPredictor
-predictor = create_paddle_predictor(config)
-```
-
-PowerMode详细说明如下：
-
-|         选项         | 说明                                                         |
-| :------------------: | ------------------------------------------------------------ |
-|   LITE_POWER_HIGH    | 绑定大核运行模式。如果ARM CPU支持big.LITTLE，则优先使用并绑定Big cluster。如果设置的线程数大于大核数量，则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败，如果失败则进入不绑核模式。 |
-|    LITE_POWER_LOW    | 绑定小核运行模式。如果ARM CPU支持big.LITTLE，则优先使用并绑定Little cluster。如果设置的线程数大于小核数量，则会将线程数自动缩放到小核数量。如果找不到小核，则自动进入不绑核模式。 |
-|   LITE_POWER_FULL    | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时，则会自动将线程数缩放到核心数量。 |
-|  LITE_POWER_NO_BIND  | 不绑核运行模式（推荐）。系统根据负载自动调度任务到空闲的CPU核心上。 |
-| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心，则每预测10次后切换绑定到下一个核心。 |
-| LITE_POWER_RAND_LOW  | 轮流绑定小核模式。如果Little cluster有多个核心，则每预测10次后切换绑定到下一个核心。 |
-
-
-
-## Tensor
+### [Tensor](./python_api/Tensor)

 ```c++
 class Tensor
@@ -667,134 +72,3 @@ class Tensor
 Tensor是Paddle-Lite的数据组织形式，用于对底层数据进行封装并提供接口对数据进行操作，包括设置Shape、数据、LoD信息等。

 *注意：用户应使用`CxxPredictor`或`LightPredictor`的`get_input`和`get_output`接口获取输入/输出的`Tensor`。*
-
-示例：
-
-```python
-from __future__ import print_function
-from lite_core import *
-
-# 1. 设置CxxConfig
-config = CxxConfig()
-if args.model_file != '' and args.param_file != '':
-    config.set_model_file(args.model_file)
-    config.set_param_file(args.param_file)
-else:
-    config.set_model_dir(args.model_dir)
-places = [Place(TargetType.ARM, PrecisionType.FP32)]
-config.set_valid_places(places)
-
-# 2. 创建CxxPredictor
-predictor = create_paddle_predictor(config)
-
-# 3. 设置输入数据
-input_tensor = predictor.get_input(0)
-input_tensor.resize([1, 3, 224, 224])
-input_tensor.set_float_data([1.] * 3 * 224 * 224)
-
-# 4. 运行模型
-predictor.run()
-
-# 5. 获取输出数据
-output_tensor = predictor.get_output(0)
-print(output_tensor.shape())
-print(output_tensor.float_data()[:10])
-```
-
-### `resize(shape)`
-
-设置Tensor的维度信息。
-
-参数：
-
- `shape(list)` - 维度信息
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `shape()`
-
-获取Tensor的维度信息。
-
-参数：
-
- `None`
-
-返回：Tensor的维度信息
-
-返回类型：`list`
-
-
-
-### `float_data()`
-
-获取Tensor的持有的float型数据。
-
-示例：
-
-```python
-output_tensor = predictor.get_output(0)
-print(output_tensor.shape())
-print(output_tensor.float_data()[:10])
-```
-
-参数：
-
- `None`
-
-返回：`Tensor`持有的float型数据
-
-返回类型：`list`
-
-
-
-### `set_float_data(float_data)`
-
-设置Tensor持有float数据。
-
-示例：
-
-```python
-input_tensor = predictor.get_input(0)
-input_tensor.resize([1, 3, 224, 224])
-input_tensor.set_float_data([1.] * 3 * 224 * 224)
-```
-
-参数：
-
- `float_data(list)` - 待设置的float型数据
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `set_lod(lod)`
-
-设置Tensor的LoD信息。
-
-参数：
-
- `lod(list[list])` - Tensor的LoD信息
-
-返回：`None`
-
-返回类型：`None`
-
-
-
-### `lod()`
-
-获取Tensor的LoD信息
-
-参数：
-
- `None`
-
-返回：`Tensor`的LoD信息
-
-返回类型：`list[list]`
--- a/docs/benchmark/benchmark.md
+++ b/docs/benchmark/benchmark.md
@@ -2,7 +2,7 @@

 可以参考[benchmark_tools](benchmark_tools)，推荐**一键benchmark**。

-## 测试环境
+## ARM测试环境

 * 测试模型
    * fp32模型
@@ -18,7 +18,7 @@

 * 测试机器(android ndk ndk-r17c)
   *  骁龙855
-      * xiaomi mi9, snapdragon 855 
+      * xiaomi mi9, snapdragon 855 (enable sdot instruction)
      * 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz

   *  骁龙845
@@ -33,12 +33,12 @@
      * HUAWEI Mate10
 
 * 测试说明
-    * branch: release/v2.3.0
+    * branch: release/v2.6.0
    * warmup=10, repeats=30，统计平均时间，单位是ms
    * 当线程数为1时，```DeviceInfo::Global().SetRunMode```设置LITE_POWER_HIGH，否则设置LITE_POWER_NO_BIND
    * 模型的输入图像的维度是{1, 3, 224, 224}，输入图像的每一位数值是1
    
-## 测试数据
+## ARM测试数据


 ### fp32模型测试数据
@@ -48,75 +48,131 @@
 骁龙855|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 
-mobilenet_v1 |33.27 |19.52 |11.14 |31.72 |18.76 |10.24 |
-mobilenet_v2 |29.08 |15.79 |9.25 |25.89 |14.17 |8.38 |
-shufflenet_v2 |4.40 |3.09 |2.30 |4.28 |3.02 |2.35 |
-squeezenet_v1.1 |19.96 |12.61 |8.76 |18.25 |11.46 |7.97 |
-mnasnet |21.00 |12.54 |7.28 |19.65 |11.65 |6.96 |
+mobilenet_v1 |35.11 |20.67 |11.83 |30.56 |18.59 |10.44 |
+mobilenet_v2 |26.36 |15.83 |9.29 |21.64 |13.25 |7.95 |
+shufflenet_v2 |4.56 |3.14 |2.35 |4.07 |2.89 |2.28 |
+squeezenet_v1.1 |21.27 |13.55 |8.49 |18.05 |11.51 |7.83 |
+mnasnet |21.40 |13.18 |7.63 |18.84 |11.40 |6.80 |


 骁龙845|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 
-mobilenet_v1 |66.36 |35.97 |19.45 |62.66 |33.87 |17.85 |
-mobilenet_v2 |45.86 |25.53 |14.6 |41.58 |23.24 |13.39 |
-shufflenet_v2 |7.58 |4.89 |3.41 |7.44 |4.91 |3.58 |
-squeezenet_v1.1 |37.15 |22.74 |13.51 |34.69 |21.27 |12.74 |
-mnasnet |40.09 |21.73 |11.91 |38.19 |21.02 |12.11 |
+mobilenet_v1 |65.56 |37.17 |19.65 |63.23 |32.98 |17.68 |
+mobilenet_v2 |45.89 |25.20 |14.39 |41.03 |22.94 |12.98 |
+shufflenet_v2 |7.31 |4.66 |3.27 |7.08 |4.71 |3.41 |
+squeezenet_v1.1 |36.98 |22.53 |13.45 |34.27 |20.96 |12.60 |
+mnasnet |39.85 |23.64 |12.25 |37.81 |20.70 |11.81 |


 骁龙835|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 
-mobilenet_v1 |96.98 |53.92 |32.24 |89.31 |48.02 |27.58 |
-mobilenet_v2 |67.72 |37.66 |23.82 |60.10 |34.36 |21.05 |
-shufflenet_v2 |10.72 |6.62 |4.63 |10.10 |6.44 |4.63 |
-squeezenet_v1.1 |53.89 |33.28 |20.73 |50.83 |32.31 |19.51 |
-mnasnet |59.55 |33.53 |20.32 |56.21 |31.58 |19.06 |
+mobilenet_v1 |92.77 |51.56 |30.14 |87.46 |48.02 |26.42 |
+mobilenet_v2 |65.78 |36.52 |22.34 |58.31 |33.04 |19.87 |
+shufflenet_v2 |10.39 |6.26 |4.46 |9.72 |6.19 |4.41 |
+squeezenet_v1.1 |53.59 |33.16 |20.13 |51.56 |31.81 |19.10 |
+mnasnet |57.44 |32.62 |19.47 |54.99 |30.69 |17.98 |

 #### caffe model

 骁龙855|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |33.36 |19.45 |11.26 |31.63 |18.74 |10.31 |
-mobilenet_v2 |31.63 |19.21 |11.61 |28.34 |17.14 |10.16 |
-shufflenet_v2 |4.46 |3.08 |2.32 |4.26 |2.98 |2.35 |
+mobilenet_v1 |32.38 |18.65 |10.69 |30.75 |18.11 |9.88 |
+mobilenet_v2 |29.45 |17.86 |10.81 |26.61 |16.26 |9.67 |
+shufflenet_v2 |5.04 |3.14 |2.20 |4.09 |2.85 |2.25 |


 骁龙845|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |66.32 |35.83 |19.56 |62.52 |33.79 |17.91 |
-mobilenet_v2 |58.46 |32.69 |18.56 |53.72 |29.86 |16.80 |
-shufflenet_v2 |7.65 |4.82 |3.46 |7.55 |4.97 |3.62 |
+mobilenet_v1 |65.26 |35.19 |19.11 |61.42 |33.15 |17.48 |
+mobilenet_v2 |55.59 |31.31 |17.68 |51.54 |29.69 |16.00 |
+shufflenet_v2 |7.42 |4.73 |3.33 |7.18 |4.75 |3.39 |


 骁龙835|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |95.38 |54.09 |32.03 |95.05 |48.33 |27.54 |
-mobilenet_v2 |88.46 |48.98 |30.23 |79.28 |44.64 |27.10 |
-shufflenet_v2 |10.07 |6.51 |4.61 |10.31 |6.50 |4.66 |
+mobilenet_v1 |95.38 |52.16 |30.37 |92.10 |46.71 |26.31 |
+mobilenet_v2 |82.89 |45.49 |28.14 |74.91 |41.88 |25.25 |
+shufflenet_v2 |10.25 |6.36 |4.42 |9.68 |6.20 |4.42 |

 #### int8量化模型测试数据

 骁龙855|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 |
-mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 |
+mobilenet_v1 |37.18 |21.71 |11.16 | 14.41 |8.34 |4.37 |
+mobilenet_v2 |27.95 |16.57 |8.97 | 13.68 |8.16 |4.67 |


 骁龙835|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |60.76 |32.25 |16.66 |56.57 |29.84 |15.24 |
-mobilenet_v2 |49.38 |31.10 |22.07 |47.52 |28.18 |19.24 |
+mobilenet_v1 |61.63 |32.60 |16.49 |57.36 |29.74 |15.50 |
+mobilenet_v2 |47.13 |25.62 |13.56 |41.87 |22.42 |11.72 |


 麒麟970|armv7 | armv7 |  armv7 |armv8 | armv8 |armv8 
 ----| ---- | ---- | ---- | ----  |----  |----
 threads num|1 |2 |4 |1 |2 |4 |
-mobilenet_v1 |65.95 |34.39 |18.68 |60.86 |30.98 |16.31 |
-mobilenet_v2 |68.87 |39.39 |24.43 |65.57 |37.31 |20.87 |
+mobilenet_v1 |63.13 |32.63 |16.85 |58.92 |29.96 |15.42 |
+mobilenet_v2 |48.60 |25.43 |13.76 |43.06 |22.10 |12.09 |
+
+
+## 华为麒麟NPU测试环境
+
+* 测试模型
+    * fp32模型
+        * mobilenet_v1
+        * mobilenet_v2
+        * squeezenet_v1.1
+        * mnasnet
+
+* 测试机器(android ndk ndk-r17c)
+   *  麒麟810
+      * HUAWEI Nova5, Kirin 810
+      * 2xCortex A76 2.27GHz + 6xCortex A55 1.88GHz
+
+   *  麒麟990
+      * HUAWEI Mate 30, Kirin 990
+      * 2 x Cortex-A76 Based 2.86 GHz + 2 x Cortex-A76 Based 2.09 GHz + 4 x Cortex-A55 1.86 GHz
+
+   *  麒麟990 5G
+      * HUAWEI P40, Kirin 990 5G
+      * 2 x Cortex-A76 Based 2.86GHz + 2 x Cortex-A76 Based 2.36GHz + 4 x Cortex-A55 1.95GHz
+
+* HIAI ddk 版本： 310 or 320
+ 
+* 测试说明
+    * branch: release/v2.6.1
+    * warmup=10, repeats=30，统计平均时间，单位是ms
+    * 线程数为1，```DeviceInfo::Global().SetRunMode```设置LITE_POWER_HIGH
+    * 模型的输入图像的维度是{1, 3, 224, 224}，输入图像的每一位数值是1
+    
+## 华为麒麟NPU测试数据
+
+#### paddlepaddle model
+
+- ddk 310
+
+|Kirin |810||990||990 5G||
+|---|---|---|---|---|---|---|
+|  |cpu(ms) | npu(ms) |cpu(ms) | npu(ms) |cpu(ms) | npu(ms) |
+|mobilenet_v1|	 41.20|  12.76|  31.91|  4.07|  33.97|  3.20|
+|mobilenet_v2|	 29.57|  12.12|  22.47|  5.61|  23.17|  3.51|
+|squeezenet|  23.96|  9.04|  17.79|  3.82|	 18.65|  3.01|
+|mnasnet|  26.47|  13.62|  19.54|  5.17|	 20.34|  3.32|
+
+
+- ddk 320
+
+|模型 |990||990-5G||
+|---|---|---|---|---|
+||cpu(ms) | npu(ms) |cpu(ms) | npu(ms) |
+|ssd_mobilenetv1|  65.67|  18.21|  71.8|	16.6|
+
+
+*说明：ssd_mobilenetv1的npu性能为npu、cpu混合调度运行的总时间*
--- a/docs/benchmark/benchmark_tools.md
+++ b/docs/benchmark/benchmark_tools.md
@@ -28,63 +28,64 @@ List of devices attached
 执行以下命令，完成Benchmark：

 ```shell
-wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/run_benchmark.sh
+# Test v2.6 branch
+wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_2.6/run_benchmark.sh
+sh run_benchmark.sh
+
+# Test v2.3 branch
+wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_2.3/run_benchmark.sh
 sh run_benchmark.sh
 ```

 该`run_benchmark.sh`脚本会：

-1. 下载模型，并上传手机：包含mobilenetv1/v2、shufflenetv2、squeezenetv1.1、mnasnet；
+1. 下载模型，并上传手机：包含mobilenetv1、mobilenetv2、shufflenetv2、squeezenetv1.1、mnasnet、mobilenetv1_int8、mobilenetv2_int8；
 2. 下载pre-built android-armv7和android-armv8的可执行文件，并上传手机：`benchmark_bin_v7`和`benchmark_bin_v8`；
 3. 自动执行另一个脚本`benchmark.sh`（多台手机连接USB，请在`benchmark.sh`脚本中对`adb`命令后加上测试手机的`serial number`）；
 4. 从手机下载benchmark结果`result_armv7.txt`和`result_armv8.txt`，到当前目录，并显示Benchmark结果。

 ## 二. 逐步Benchmark

-### 1. 获取benchmark可执行文件
-
-benchmark_bin文件可以测试PaddleLite的性能，有下面两种方式获得。
-
-#### 方式一：下载benchmark_bin可执行文件
-
-```shell
-# Download benchmark_bin for android-armv7
-wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v7
-
-# Download benchmark_bin for android-armv8
-wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v8
-```
-
-#### 方式二：由源码编译benchmark_bin文件
+### 1. 编译benchmark可执行文件

-根据[源码编译](../user_guides/source_compile)准备编译环境，拉取PaddleLite最新release发布版代码，并在仓库根目录下，执行：
+根据[源码编译](../user_guides/source_compile)准备编译环境，拉取PaddleLite特定分支的最新代码，并在仓库根目录下，执行：

 ```shell
 ###########################################
 # Build benchmark_bin for android-armv7   #
 ###########################################
-./lite/tools/ci_build.sh  \
-  --arm_os="android" \
-  --arm_abi="armv7" \
-  --arm_lang="gcc " \
-  build_arm
+
+./lite/tools/build.sh \
+  --arm_os=android \
+  --arm_abi=armv7 \
+  --arm_lang=gcc \
+  --android_stl=c++_static \
+  --build_extra=ON \
+  --with_log=OFF \
+  full_publish

 # `benchmark_bin` 在: <paddle-lite-repo>/build.lite.android.armv7.gcc/lite/api/benchmark_bin

 ###########################################
 # Build benchmark_bin for android-armv8   #
 ###########################################
-./lite/tools/ci_build.sh  \
-  --arm_os="android" \
-  --arm_abi="armv8" \
-  --arm_lang="gcc "  \
-  build_arm
+
+./lite/tools/build.sh \
+  --arm_os=android \
+  --arm_abi=armv8 \
+  --arm_lang=gcc \
+  --android_stl=c++_static \
+  --build_extra=ON \
+  --with_log=OFF \
+  full_publish

 # `benchmark_bin` 在: <paddle-lite-repo>/build.lite.android.armv8.gcc/lite/api/benchmark_bin
 ```

 > **注意**：为了避免在docker内部访问不到手机的问题，建议编译得到benchmark_bin后退出到docker外面，并且将benchmark_bin文件拷贝到一个临时目录。然后在该临时目录下，按照下面步骤下载模型、拷贝脚本、测试。

+> **注意**：如果不是测试常见分类模型（单输入，输入shape是1x3x224x224），需要根据实际情况修改`/PaddleLite/lite/api/benchmark.cc`文件，然后编译得到可执行文件。
+
 ### 2. 准备模型

 PaddleLite为Benchmark准备好了[常见Benchmark模型](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_models.tgz)。

--- a/docs/demo_guides/cpp_demo.md
+++ b/docs/demo_guides/cpp_demo.md
@@ -32,14 +32,26 @@ tar zxf mobilenet_v1.tar.gz

 ![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/3inference_model.png)

-（2）下载[opt工具](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt)。放入同一文件夹，终端输入命令转化模型：
+（2）模型转换

-```shell
-wget https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt
-chmod +x opt
-./opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer   --optimize_out=./mobilenet_v1_opt
-```

+  - v2.6.0版本之前
+
+  下载[opt工具](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt)。放入同一文件夹，终端输入命令转化模型
+
+  ```shell
+  wget https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt
+  chmod +x opt
+  ./opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer   --optimize_out=./mobilenet_v1_opt
+  ```
+  - v2.6.0版本以及后续版本
+
+  安装paddlelite，终端输入命令转化模型
+ 
+  ```shell
+  python -m pip install paddlelite
+  paddle_lite_opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer   --optimize_out=./mobilenet_v1_opt
+  ```
 **结果如下图所示：**

 ![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/2opt_model.png)

--- a/docs/demo_guides/ios_app_demo.md
+++ b/docs/demo_guides/ios_app_demo.md
@@ -90,7 +90,7 @@ ios-detection_demo/detection_demo/ViewController.mm

 ## 代码讲解 （如何使用Paddle-Lite C++ API 执行预测）

-IOS 示例基于C++ API 开发，调用Paddle-Lite C++ API包括以下五步。更详细的API 描述参考： [Paddle-Lite C++ API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html)。
+IOS 示例基于C++ API 开发，调用Paddle-Lite C++ API包括以下五步。更详细的API 描述参考： [Paddle-Lite C++ API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/cxx_api_doc.html)。

 ```c++
 #include <iostream>

--- a/docs/demo_guides/linux_arm_demo.md
+++ b/docs/demo_guides/linux_arm_demo.md
+# Linux(ARM) Demo
+
+## 多种应用场景
+
+我们提供Paddle-Lite示例工程[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)，其中包含[Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo)、[iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)和[Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo)平台的示例工程。Linux(ARM) demo涵盖[图像分类](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo)、[目标检测](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo)2个应用场景。
+
+### 1. 图像分类
+
+Paddle-Lite提供的图像分类demo ，在移动端上提供了实时的物体识别能力，可以应用到生产线自动分拣或质检、识别医疗图像、辅助医生肉眼诊断等场景。在移动端预测的效果图如下：
+
+<p align="center"><img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat2.jpg"/></p>
+
+### 2. 物体检测
+
+Paddle-Lite提供的物体检测demo ，在移动端上提供了检测多个物体的位置、名称及数量的能力。可以应用到视频监控（是否有违规物体或行为）、工业质检（微小瑕疵的数量和位置）、医疗诊断（细胞计数、中药识别）等场景。在移动端预测的效果图如下：
+
+<p align="center"><img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250"  src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog2.jpg"/></p>
+
+## Linux(ARM) demo部署方法
+
+下面我们以**目标检测（object_detection_demo)**为例讲解如何部署Linux(ARM)工程。
+
+**目的**：将基于Paddle-Lite的预测库部署到Linux(ARM)设备，实现物体检测的目标。
+
+**需要的环境**：Linux(ARM)设备、下载到本地的[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)工程
+
+**部署步骤**：
+
+1、 目标检测的Linux(ARM)示例位于 `Paddle-Lite-Demo/PaddleLite-armlinux-demo/object_detection_demo`
+
+2、终端中执行 `download_models_and_libs.sh` 脚本自动下载模型和Paddle-Lite预测库
+
+```shell
+cd PaddleLite-armlinux-demo          # 1. 终端中进入 Paddle-Lite-Demo\PaddleLite-armlinux-demo
+sh download_models_and_libs.sh       # 2. 执行脚本下载依赖项 （需要联网）
+```
+
+下载完成后会出现提示： `Download successful!`
+
+3、终端中执行 `run.sh` 脚本编译并运行物体检测demo
+```shell
+cd object_detection_demo    # 1. 终端中进入
+sh run.sh                   # 2. 执行脚本编译并执行物体检测demo，输出预测数据和运行时间
+```
+demo结果如下:
+<img width="836" alt="image" src="https://user-images.githubusercontent.com/50474132/82852558-da228580-9f35-11ea-837c-e4d71066da57.png">
+
+## 使用C++接口预测
+Linux(ARM) demo 示例基于C++ API 开发，调用Paddle-Lite C++ API包括以下五步。更详细的API 描述参考： [Paddle-Lite C++ API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/cxx_api_doc.html)。
+
+```c++
+#include <iostream>
+// 引入C++ API
+#include "paddle_lite/paddle_api.h"
+#include "paddle_lite/paddle_use_ops.h"
+#include "paddle_lite/paddle_use_kernels.h"
+
+// 1. 设置MobileConfig
+MobileConfig config;
+config.set_model_from_file(<modelPath>); // 设置NaiveBuffer格式模型路径
+config.set_power_mode(LITE_POWER_NO_BIND); // 设置CPU运行模式
+config.set_threads(4); // 设置工作线程数
+
+// 2. 创建PaddlePredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
+
+// 3. 设置输入数据
+std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+input_tensor->Resize({1, 3, 224, 224});
+auto* data = input_tensor->mutable_data<float>();
+for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+  data[i] = 1;
+}
+
+// 4. 执行预测
+predictor->Run();
+
+// 5. 获取输出数据
+std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
+std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
+for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
+  std::cout << "Output[" << i << "]: " << output_tensor->data<float>()[i]
+            << std::endl;
+}
+```
+
+## 使用Python接口预测
+
+1. Python预测库编译参考[编译Linux](../user_guides/Compile/Linux)，建议在开发板上编译。
+2. [Paddle-Lite Python API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/python_api_doc.html)。
+3. 代码参考，[Python预测](python_demo)
--- a/docs/demo_guides/npu.md
+++ b/docs/demo_guides/npu.md
@@ -5,7 +5,7 @@ Paddle Lite是首款支持华为自研达芬奇架构NPU（Kirin 810/990 SoC搭

 ## 已支持的设备

- 华为nova5、nova5i pro、mate30、mate30 pro、mate30 5G、荣耀v30，以及即将推出的mate40、p40。据华为透露，今后上市的大部分手机都会搭载其自研达芬奇架构NPU。
+- 华为nova5、nova5i pro、mate30、mate30 pro、mate30 5G、荣耀v30、p40、p40 pro，以及即将推出的mate40。据华为透露，今后上市的大部分手机都会搭载其自研达芬奇架构NPU。

 ## 已支持的模型

@@ -13,9 +13,14 @@ Paddle Lite是首款支持华为自研达芬奇架构NPU（Kirin 810/990 SoC搭
 - MobileNetV2
 - ResNet-18/50
 - ShuffleNetV2
+- squeezenet
+- mnasnet
+- yolov3
 - CycleGAN (暂时需要华为内部rom的支持)
 - 百度内部业务模型（由于涉密，不方便透露具体细节）

+*CPU/NPU混合调度在部分模型可以获得更佳的性能*
+
 ## 已支持（或部分支持）的Paddle算子

 - sigmoid
@@ -64,8 +69,8 @@ Paddle Lite是首款支持华为自研达芬奇架构NPU（Kirin 810/990 SoC搭

 ## 编译支持NPU的Paddle Lite库

- 从https://developer.huawei.com/consumer/cn/hiai/下载华为HiAI DDK后解压到任意路径（注意：华为提供了多个版本的DDK，我们需要下载针对麒麟810/990芯片HiAI Foundation开发套件，例如最新的[DDK V310版本](https://obs.cn-north-2.myhwclouds.com/hms-ds-wf/sdk/hwhiai-ddk-100.310.011.010.zip)）。
- 将HiAI DDK中的ai_ddk_lib目录拷贝至Paddle Lite源码根目录后，使用[NPU编译脚本](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/tools/build_npu.sh)编译full_publish和tiny_publish。
+- 从[华为HiAI平台](https://developer.huawei.com/consumer/cn/hiai)下载华为HiAI DDK后解压到任意路径（注意：华为提供了多个版本的DDK，我们需要下载针对麒麟810/990芯片HiAI Foundation开发套件，例如[DDK V310版本](https://obs.cn-north-2.myhwclouds.com/hms-ds-wf/sdk/hwhiai-ddk-100.310.011.010.zip)）。
+- 将HiAI DDK中的ai_ddk_lib目录拷贝至Paddle Lite源码根目录后，使用[编译脚本](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/tools/build_android.sh)编译 (需要指定NPU相关选项)。

 注意：以下是HiAI DDK V310版解压后的目录结构，需要将ai_ddk_lib目录拷贝至Paddle Lite源码根目录。
 ```shell
@@ -79,16 +84,11 @@ Paddle Lite是首款支持华为自研达芬奇架构NPU（Kirin 810/990 SoC搭
 - tools
 ```

- full_publish and tiny_publish for armv8，由于HiAI DDK的armv7和armv8的so库均基于c++_shared构建，因此，建议使用c++_shared编译Paddle Lite。
-```shell
-$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared full_publish
-$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared tiny_publish
-```
-
- full_publish and tiny_publish for armv7
+- 推荐编译命令。由于HiAI DDK的so库均基于c++_shared构建，因此，建议使用c++_shared编译Paddle Lite。
 ```shell
-$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared full_publish
-$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared tiny_publish
+# huawei_kirin_npu_sdk_root 需要指向 ai_ddk_lib 的路径
+$ ./lite/tools/build_android.sh --android_stl=c++_shared --with_huawei_kirin_npu=ON --huawei_kirin_npu_sdk_root=<path-to-ai_ddk_lib>
+# 其它选项可以通过 "./lite/tools/build_android.sh help" 查看，例如arm版本等 
 ```

 注意：为了保证编译环境一致，建议参考[源码编译](../user_guides/source_compile)中的Docker开发环境进行配置，然后再执行上述命令。
@@ -166,15 +166,15 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
 - 2、初步分析

    - 下图是ssd_mobilenet_v1中的部分结构。其中红色部分暂时不支持在NPU上运行，蓝色部分可能NPU上的性能不理想。此时，如果直接让预测库自动调度的话，可能会分成多个子图，而且整体性能不佳。因此，可以将蓝色部分和绿色部分整体指定在CPU上运行，让其他部分自动运行在NPU上(红色部分会自动在CPU上运行)。
-    ![ssd_mobilenet_v1_example](https://user-images.githubusercontent.com/50474132/80453173-525b5280-895a-11ea-847f-c7dd5b5799de.png)
+    ![](https://user-images.githubusercontent.com/50474132/80453173-525b5280-895a-11ea-847f-c7dd5b5799de.png)

 - 3、使用opt转换模型

    - opt转换过程中会打印log信息。在log中搜索```digraph G```和```// end G```可以找到优化后的模型图。
-    ![image](https://user-images.githubusercontent.com/50474132/80454098-145f2e00-895c-11ea-9f16-dde1483a9beb.png)
-    ![image](https://user-images.githubusercontent.com/50474132/80454123-1de89600-895c-11ea-86b9-a62d78a6616d.png)
+    ![](https://user-images.githubusercontent.com/50474132/80454098-145f2e00-895c-11ea-9f16-dde1483a9beb.png)
+    ![](https://user-images.githubusercontent.com/50474132/80454123-1de89600-895c-11ea-86b9-a62d78a6616d.png)
    - 将从```digraph G```开始的，到```// end G```结束的整段模型图信息，保存到```.dot```格式的文件中。可以用```graphviz```打开查看，或者在[网页版](http://dreampuf.github.io/GraphvizOnline/)查看。
-    ![image](https://user-images.githubusercontent.com/50474132/80454841-47ee8800-895d-11ea-9531-5689c5560fcb.png)
+    ![](https://user-images.githubusercontent.com/50474132/80454841-47ee8800-895d-11ea-9531-5689c5560fcb.png)
    - 在此处确认需要被指定的算子是否被优化了。(期望是被指定的算子都还独立存在，如果被融合为了一个算子，需要指定此时融合后的算子)。

 - 4、写配置文件
@@ -186,7 +186,7 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
    concat
    softmax
    ```
-    - 由于这些算子都指定在NPU上运行，因此不需要特意配置算子的输入输出名称。
+    - 由于这些算子都指定在CPU上运行，因此不需要特意配置算子的输入输出名称。

 - 5、指定配置文件路径


--- a/docs/demo_guides/python_demo.md
+++ b/docs/demo_guides/python_demo.md
+# Python Demo
+
+## 1. 下载最新版本python预测库
+
+```shell
+python -m pip install paddlelite
+```
+
+## 2. 转化模型
+
+PaddlePaddle的原生模型需要经过[opt](../user_guides/model_optimize_tool)工具转化为Paddle-Lite可以支持的naive_buffer格式。
+
+以`mobilenet_v1`模型为例：
+
+（1）下载[mobilenet_v1模型](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz)后解压：
+
+```shell
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxf mobilenet_v1.tar.gz
+```
+
+（2）使用opt工具：
+
+ 从磁盘加载模型时，根据模型和参数文件存储方式不同，加载模型和参数的路径有两种形式。
+
+- Linux环境
+  - 非combined形式：模型文件夹model_dir下存在一个模型文件和多个参数文件时，传入模型文件夹路径，模型文件名默认为__model__。
+
+  ```shell
+  paddle_lite_opt --model_dir=./mobilenet_v1  \
+                  --optimize_out=mobilenet_v1_opt \
+                  --optimize_out_type=naive_buffer \
+                  --valid_targets=x86
+  ```
+  - combined形式：模型文件夹model_dir下只有一个模型文件__model__和一个参数文件__params__时，传入模型文件和参数文件路径
+
+  ```shell
+  paddle_lite_opt --model_file=./mobilenet_v1/__model__ \
+                  --param_file=./mobilenet_v1/__params__  \
+                  --optimize_out=mobilenet_v1_opt \
+                  --optimize_out_type=naive_buffer \
+                  --valid_targets=x86
+  ```
+
+- windows环境
+
+windows 暂不支持命令行方式直接运行模型转换器，需要编写python脚本
+
+```python
+import paddlelite.lite as lite
+
+a=lite.Opt()
+# 非combined形式
+a.set_model_dir("D:\\YOU_MODEL_PATH\\mobilenet_v1")
+
+# combined形式
+# a.set_model_file("D:\\YOU_MODEL_PATH\\mobilenet_v1\\__model__")
+# a.set_param_file("D:\\YOU_MODEL_PATH\\mobilenet_v1\\__params__")
+
+a.set_optimize_out("mobilenet_v1_opt")
+a.set_valid_places("x86")
+
+a.run()
+```
+
+- MAC 环境
+
+Opt工具使用方式同Linux（MAC环境暂不支持python端预测，下个版本会修复该问题）
+
+## 3. 编写预测程序
+
+准备好预测库和模型，我们便可以编写程序来执行预测。我们提供涵盖图像分类、目标检测等多种应用场景的C++示例demo可供参考，创建文件mobilenetV1_light_api.py，
+python demo 完整代码位于 [demo/python](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/python/mobilenetv1_light_api.py) 。
+
+(1) 设置config信息
+```python
+from paddlelite.lite import *
+
+config = MobileConfig()
+config.set_model_from_file("/YOU_MODEL_PATH/mobilenet_v1_opt.nb")
+```
+
+(2) 创建predictor
+
+```python
+predictor = create_paddle_predictor(config)
+```
+
+(3) 设置输入数据
+```python
+input_tensor = predictor.get_input(0)
+input_tensor.resize([1, 3, 224, 224])
+input_tensor.set_float_data([1.] * 3 * 224 * 224)
+```
+
+(4) 执行预测
+```python
+predictor.run()
+```
+
+(5) 得到输出数据
+```python
+output_tensor = predictor.get_output(0)
+print(output_tensor.shape())
+print(output_tensor.float_data()[:10])
+```
+
+## 4. 运行文件
+```shell
+python mobilenetV1_light_api.py
+```
--- a/docs/demo_guides/x86.md
+++ b/docs/demo_guides/x86.md
@@ -4,8 +4,6 @@

 Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../user_guides/source_compile)。

-(注意：非docker Linux环境需要是Ubuntu16.04)
-
 ### 编译

 1、 下载代码
@@ -20,10 +18,11 @@ git checkout release/v2.6.0

 ```bash
 cd Paddle-Lite
-./lite/tools/build.sh x86
+./lite/tools/build.sh --build_python=ON x86

 # 其他可选择编译选项
 # --with_log=OFF 关闭LOG信息输出
+# --build_python=OFF 不编译python预测库
 ```

 ### 编译结果说明
@@ -53,8 +52,17 @@ x86编译结果位于 `build.lite.x86/inference_lite_lib`
 - `mobilenetv1_full` ：使用full_api 执行mobilenet_v1预测的C++ demo
 - `mobilenetv1_light` ：使用light_api 执行mobilenet_v1预测的C++ demo

+5、 `demo/python`文件夹：x86预测库的Python 示例demo
+
+- `mobilenetv1_full_api.py` ：使用full_api 执行mobilenet_v1预测的Python demo
+- `mobilenetv1_light_api.py` ：使用light_api 执行mobilenet_v1预测的Python demo

+6、 `python`文件夹：包含python的库文件和对应的.whl包

+- `install`文件夹：编译成功的.whl包位于`install/dist/*.whl`
+- `lib`文件夹：.whl包依赖的库文件
+
+**(若不需要编译python预测库，则将编译命令替换为`./lite/tools/build.sh x86`)**

 ### x86预测API使用示例

@@ -64,7 +72,8 @@ x86编译结果位于 `build.lite.x86/inference_lite_lib`
 mobilenetv1_full/
 |-- CMakeLists.txt
 |-- build.sh
-`-- mobilenet_full_api.cc
+|-- build.bat
+`-- mobilenet_full_api.cc
 ```

 本demo使用cmake构建`CMakeLists.txt`为cmake脚本，`mobilenet_full_api.cc`是x86示例的源代码、`build.sh`为编译的脚本。
@@ -168,8 +177,8 @@ int main(int argc, char** argv) {
 #### 编译环境需求

 - Windows 10 专业版
-  - 目前Windows暂不支持GPU模式
- *Python 版本 2.7/3.5.1+/3.6/3.7 (64 bit)*
+  - 目前Windows暂不支持GPU编译
+- *Python 版本 2.7/3.5.1+ (64 bit)*
 - *pip 或 pip3 版本 9.0.1+ (64 bit)*
 - *Visual Studio 2015 Update3*

@@ -187,15 +196,15 @@ int main(int argc, char** argv) {
 ```bash
 git clone https://github.com/PaddlePaddle/Paddle-Lite.git
 # 切换到release分支
-git checkout release/v2.3
+git checkout release/v2.6.0
 ```
-2、 源码编译
+2、 源码编译(需要按照提示输入对应的参数)

-```bash
+```dos
 cd Paddle-Lite
-lite/tools/build_windows.bat with_extra with_python with_profile
+lite\tools\build_windows.bat with_extra with_python with_profile
 ```
-编译脚本`lite/tools/build.bat`，追加参数说明：
+编译脚本`build_windows.bat`，追加参数说明：

 |   参数     |     介绍     |     值     |
 |-----------|-------------|-------------|
@@ -203,40 +212,62 @@ lite/tools/build_windows.bat with_extra with_python with_profile
 |  with_python | 可选，是否编译python预测库（默认为OFF） 。 | `ON`、`OFF` |
 |  with_profile | 可选，是否支持分析器模式（默认为OFF） 。 | `ON`、`OFF` |

-### 编译结果
+### 编译结果说明

 x86编译结果位于 `build.lite.x86/inference_lite_lib`
 **具体内容**说明：

-1、 `bin`文件夹：可执行工具文件 `test_model_bin`
-
-2、 `cxx`文件夹：包含c++的库文件与相应的头文件
+1、 `cxx`文件夹：包含c++的库文件与相应的头文件

 - `include`  : 头文件
 - `lib` : 库文件
-  - 打包的静态库文件：
+  - 静态库文件：
    - `libpaddle_api_full_bundled.lib`  ：full_api 静态库
    - `libpaddle_api_light_bundled.lib` ：light_api 静态库

-3、 `third_party` 文件夹：第三方库文件
+2、 `third_party` 文件夹：依赖的第三方预测库mklml
+
+- mklml : Paddle-Lite预测库依赖的mklml数学库
+
+3、 `demo/cxx`文件夹：x86预测库的C++ 示例demo
+
+- `mobilenetv1_full` ：使用full_api 执行mobilenet_v1预测的C++ demo
+- `mobilenetv1_light` ：使用light_api 执行mobilenet_v1预测的C++ demo
+
+4、 `demo/python`: x86预测库的Python示例demo
+
+- `mobilenetv1_full_api.py`:使用full_api 执行mobilenet_v1预测的Python demo
+- `mobilenetv1_light_api.py`:使用light_api 执行mobilenet_v1预测的Python demo

+5、 `python`文件夹：包含python的库文件和对应的.whl包
+
+- `install`文件夹：编译成功的.whl包位于`install/dist/*.whl`
+- `lib`文件夹：.whl包依赖的库文件
 ### x86预测API使用示例

-1、我们提供Windows环境下x86 API运行mobilenet_v1的示例：[mobilenet_full_x86demo](https://paddlelite-data.bj.bcebos.com/x86/mobilenet_full_x86demo.zip)。下载解压后内容如下>：
+1、`mobilenetv1_full`目录结构

-![](https://paddlelite-data.bj.bcebos.com/x86/x86-doc/demo.png)
+```bash
+mobilenetv1_full/
+|-- CMakeLists.txt
+|-- build.sh
+|-- build.bat
+`-- mobilenet_full_api.cc
+```

-`mobilenet_v1`为模型文件、`lib`和`include`分别是Paddle-Lite的预测库和头文件、`third_party`下是编译时依赖的第三方库`mklml`、`mobilenet_full_api.cc`是x86示例的源代码、`build.bat`为编译的脚本。
+本demo使用cmake构建，`CMakeLists.txt`为cmake脚本，`mobilenet_full_api.cc`是x86示例的源代码、`build.sh`为Linux x86编译的脚本，`build.bat`为windows x86编译脚本。

-2、demo内容与使用方法
+2、demo使用方法

 ``` bash
-# 1、编译(需在vs2015的命令窗口执行该脚本)
+# 1、编译
+cd mobilenetv1_full
 build.bat
+cd build
 ```
-编译结果为当前目录下的 `Release\\mobilenet_full_api.exe`
-``` bash
+编译结果为当前目录下的 `Release\mobilenet_full_api.exe`
+``` dos
 # 2、执行预测
-Release\\mobilenet_full_api.exe ..\mobilenet_v1
+Release\mobilenet_full_api.exe mobilenet_v1
 ```
-`mobilenet_v1`为模型路径，`mobilenet_full_api.exe`为第一步编译出的可执行文件。
+下载并解压模型[`mobilenet_v1`](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz)到当前`build`目录，执行以上命令进行预测。
--- a/docs/develop_guides/add_layout.md
+++ b/docs/develop_guides/add_layout.md
@@ -165,9 +165,7 @@ std::set<DataLayoutType> ExpandValidLayouts(DataLayoutType layout) {
 // 该文件第2处
 // 找到文件中的下面的函数
 KernelRegistry::KernelRegistry()
-    : registries_(static_cast<int>(TARGET(NUM)) *
-                  static_cast<int>(PRECISION(NUM)) *
-                  static_cast<int>(DATALAYOUT(NUM)))
+    : registries_() {

 // 在该函数中加入新增Layout的下面内容
  INIT_FOR(kOpenCL, kFP16, kNCHW);

--- a/docs/develop_guides/add_operation.md
+++ b/docs/develop_guides/add_operation.md
@@ -27,6 +27,28 @@
        bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
        void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
        std::string DebugString() const override { return "argmax"; }
+
+    #ifdef LITE_WITH_PROFILE
+        void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
+            auto input_dims = param_.X->dims();
+            auto output_dims = param_.Out->dims();
+            ch->input_shape = ch->DimToStr(input_dims);
+            ch->output_shape = ch->DimToStr(output_dims);
+            ch->remark = "axis" + std::to_string(param_.Axis);
+
+            auto axis = param_.Axis;
+            if (axis < 0) {
+                axis += input_dims.size();
+            }
+            int max_num = 1;
+            for (int64_t i = axis + 1; i < input_dims.size(); i++)
+                max_num *= input_dims[i];
+            float gops = 1.0f;
+            for (int i = 1; i <= max_num; i++) gops *= i;
+            ch->macs = gops * output_dims.production();
+        }
+    #endif
+
    private:
        mutable ArgmaxParam param_;
    };
@@ -85,6 +107,13 @@
        using param_t = operators::ArgmaxParam;
        void Run() override;
        virtual ~ArgmaxCompute() = default;
+    #ifdef LITE_WITH_PROFILE
+        virtual void SetProfileRuntimeKernelInfo(
+            paddle::lite::profile::OpCharacter* ch) {
+            ch->kernel_func_name = kernel_func_name_;
+        }
+        std::string kernel_func_name_{"NotImplForArgmax"};
+    #endif
    };
    ```
 - 在paddlelite/lite/kernels/arm/目录下新建argmax_compute.cc文件，主要实现Run函数。`Run()`函数调用paddlelite/lite/bachends/arm/math/argmax.h中的`argmax_func()`函数，根据输入计算输出。最后在argmax_compute.cc文件中，我们绑定argmax的输入输出（为tensor的输入参数都需要绑定），代码如下：
@@ -95,6 +124,9 @@
        lite::Tensor* output = param.Out;
        int axis = param.Axis;
        lite::arm::math::argmax_func(input, axis, output);
+    #ifdef LITE_WITH_PROFILE
+        kernel_func_name_ = "argmax_func";
+    #endif
        return;
    }


--- a/docs/index.rst
+++ b/docs/index.rst
@@ -47,8 +47,10 @@ Welcome to Paddle-Lite's documentation!

  demo_guides/cpp_demo
  demo_guides/java_demo
+  demo_guides/python_demo
  demo_guides/android_app_demo
  demo_guides/ios_app_demo
+  demo_guides/linux_arm_demo
  demo_guides/x86
  demo_guides/cuda
  demo_guides/opencl

--- a/docs/user_guides/Compile/Android.md
+++ b/docs/user_guides/Compile/Android.md
@@ -65,7 +65,7 @@ inference_lite_lib.android.armv8/
 ```shell
 --arch: (armv8|armv7)        arm版本，默认为armv8
 --toolchain: (gcc|clang)     编译器类型，默认为gcc
--android_stl: (c++_static|c++_shared|gnu_static|gnu_shared)   NDK stl库链接方法，默认为静态链接c++_static
+--android_stl: (c++_static|c++_shared)   NDK stl库链接方法，默认为静态链接c++_static
 --with_java: (OFF|ON)        是否编译Java预测库, 默认为 ON
 --with_cv: (OFF|ON)          是否编译CV相关预处理库, 默认为 OFF
 --with_log: (OFF|ON)         是否输出日志信息, 默认为 ON

--- a/docs/user_guides/Compile/Linux.md
+++ b/docs/user_guides/Compile/Linux.md
@@ -70,6 +70,7 @@ inference_lite_lib.armlinux.armv8/
 --with_cv: (OFF|ON)             是否编译CV相关预处理库, 默认为 OFF
 --with_log: (OFF|ON)            是否输出日志信息, 默认为 ON
 ```
+**注意：with_python现在仅支持armlinux的本地编译，尚不支持docker环境和ubuntu环境**

 - 裁剪预测库方法（只编译模型中的kernel&OP，降低预测库体积）:


--- a/docs/user_guides/library.md
+++ b/docs/user_guides/library.md

-# `build_extra`参数说明：
+# `with_extra`参数说明：

 Lite预测库分为**基础预测库**和**全量预测库(with_extra)**：基础预测库只包含基础CV算子（OP），体积较小；全量预测库包含所有Lite算子，体积较大，支持模型较多。


--- a/docs/user_guides/model_optimize_tool.md
+++ b/docs/user_guides/model_optimize_tool.md
@@ -5,168 +5,57 @@ Paddle-Lite 提供了多种策略来自动优化原始的训练模型，其中

 具体使用方法介绍如下：

-**注意**：`v2.2.0` 之前的模型转化工具名称为`model_optimize_tool`，从 `v2.3` 开始模型转化工具名称修改为 `opt`
+**注意**：`v2.2.0` 之前的模型转化工具名称为`model_optimize_tool`，从 `v2.3` 开始模型转化工具名称修改为 `opt`，从`v2.6.0`开始支持python调用`opt`转化模型（Windows/Ubuntu/Mac）

 ## 准备opt
-当前获得opt方法有三种：
+当前获得`opt`工具的方法有三种：

-1. **推荐！** 可以进入Paddle-Lite Github仓库的[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases)，选择release版本下载对应的转化工具`opt`    
-   (release/v2.2.0之前的转化工具为model_optimize_tool、release/v2.3.0之后为opt)
-2. 本文提供`release/v2.3`和`release/v2.2.0`版本的优化工具下载
+- 方法一: 安装opt的python版本

-|版本 | Linux | MacOS|
-|---|---|---|
-| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
-|`release/v2.2.0`  | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) |
-
-
-3. 如果 release 列表里的工具不符合您的环境，可以下载Paddle-Lite 源码，源码编译出opt工具
-```bash
-git clone https://github.com/PaddlePaddle/Paddle-Lite.git
-cd Paddle-Lite
-git checkout <release-version-tag>
-./lite/tools/build.sh build_optimize_tool
-```
-编译结果位于`Paddle-Lite/build.opt/lite/api/opt`
-**注意**：从源码编译opt前需要先[安装Paddle-Lite的开发环境](source_compile)。
-
-## 使用opt
-
-opt是 x86 平台上的可执行文件，需要在PC端运行：支持Linux终端和Mac终端。
+安装`paddlelite` python库，安装成功后调用opt转化模型（支持`Windows/Mac/Ubuntu`）

-### 帮助信息
- 执行opt时不加入任何输入选项，会输出帮助信息，提示当前支持的选项：
 ```bash
- ./opt
+pip install paddlelite
 ```
-![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png)

-### 功能一：转化模型为Paddle-Lite格式
-opt可以将PaddlePaddle的部署模型格式转化为Paddle-Lite 支持的模型格式，期间执行的操作包括：
+- 方法二: 下载opt可执行文件
+从[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases)，选择当前预测库对应版本的`opt`转化工具

- 将protobuf格式的模型文件转化为naive_buffer格式的模型文件，有效降低模型体积
- 执行“量化、子图融合、混合调度、Kernel优选”等图优化操作，提升其在Paddle-Lite上的运行速度、内存占用等效果
+本文提供`release/v2.6`和`release/v2.2.0`版本的优化工具下载

-模型优化过程：
-
-（1）准备待优化的PaddlePaddle模型
-
-PaddlePaddle模型有两种保存格式：
-   Combined Param：所有参数信息保存在单个文件`params`中，模型的拓扑信息保存在`__model__`文件中。
-
-![opt_combined_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fcombined_model.png)
-
-   Seperated Param：参数信息分开保存在多个参数文件中，模型的拓扑信息保存在`__model__`文件中。
-![opt_seperated_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fseperated_model.png)
+|版本 | Linux | MacOS|
+|---|---|---|
+| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
+|`release/v2.2.0`  | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) |

-(2) 终端中执行`opt`优化模型
-**使用示例**：转化`mobilenet_v1`模型
+- 方法三: 源码编译opt
+源码编译 opt 可执行文件

 ```
-./opt --model_dir=./mobilenet_v1 \
-      --valid_targets=arm \
-      --optimize_out_type=naive_buffer \
-      --optimize_out=mobilenet_v1_opt
+cd Paddle-Lite && ./lite/tools/build.sh build_optimize_tool
 ```
-以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型，优化后的模型文件为`mobilenet_v1_opt.nb`，转化结果如下图所示：

-![opt_resulted_model](https://paddlelite-data.bj.bcebos.com/doc_images/2.png)
+编译结果位于`build.opt/lite/api/`下的可执行文件`opt`
+
+## 使用opt

+当前使用`opt`工具转化模型的方法有以下三种：

-(3) **更详尽的转化命令**总结：
+- 方法一： [安装 python版本opt后，使用终端命令](./opt/opt_python) （支持Mac/Ubuntu）
+- 方法二： [安装python版本opt后，使用python脚本](../api_reference/python_api/opt)（支持Windows/Mac/Ubuntu）
+- 方法三：[直接下载并执行opt可执行工具](./opt/opt_bin)（支持Mac/Ubuntu）
+- Q&A：如何安装python版本opt ?

+可以通过以下命令安装paddlelite的python库(支持`windows/Mac/Ubuntu`)：
 ```shell
-./opt \
-    --model_dir=<model_param_dir> \
-    --model_file=<model_path> \
-    --param_file=<param_path> \
-    --optimize_out_type=(protobuf|naive_buffer) \
-    --optimize_out=<output_optimize_model_dir> \
-    --valid_targets=(arm|opencl|x86|npu|xpu) \
-    --record_tailoring_info =(true|false)
+pip install paddlelite
 ```

-| 选项         | 说明 |
-| ------------------- | ------------------------------------------------------------ |
-| --model_dir         | 待优化的PaddlePaddle模型（非combined形式）的路径 |
-| --model_file        | 待优化的PaddlePaddle模型（combined形式）的网络结构文件路径。 |
-| --param_file        | 待优化的PaddlePaddle模型（combined形式）的权重文件路径。 |
-| --optimize_out_type | 输出模型类型，目前支持两种类型：protobuf和naive_buffer，其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测，请将此选项设置为naive_buffer。默认为protobuf。 |
-| --optimize_out      | 优化模型的输出路径。                                         |
-| --valid_targets     | 指定模型可执行的backend，默认为arm。目前可支持x86、arm、opencl、npu、xpu，可以同时指定多个backend(以空格分隔)，Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU（Kirin 810/990 Soc搭载的达芬奇架构NPU），应当设置为npu, arm。 |
-| --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时，则设置该选项为true，以记录优化后模型含有的kernel和OP信息，默认为false。 |

-* 如果待优化的fluid模型是非combined形式，请设置`--model_dir`，忽略`--model_file`和`--param_file`。
-* 如果待优化的fluid模型是combined形式，请设置`--model_file`和`--param_file`，忽略`--model_dir`。
-* 优化后的模型为以`.nb`名称结尾的单个文件。
-* 删除`prefer_int8_kernel`的输入参数，`opt`自动判别是否是量化模型，进行相应的优化操作。

-### 功能二：统计模型算子信息、判断是否支持
-
-opt可以统计并打印出model中的算子信息、判断Paddle-Lite是否支持该模型。并可以打印出当前Paddle-Lite的算子支持情况。
-
-（1）使用opt统计模型中算子信息
-
-下面命令可以打印出mobilenet_v1模型中包含的所有算子，并判断在硬件平台`valid_targets`下Paddle-Lite是否支持该模型
-
-`./opt --print_model_ops=true  --model_dir=mobilenet_v1 --valid_targets=arm`
-
-![opt_print_modelops](https://paddlelite-data.bj.bcebos.com/doc_images/3.png)
-
-（2）使用opt打印当前Paddle-Lite支持的算子信息
-
-`./opt --print_all_ops=true`
-
-以上命令可以打印出当前Paddle-Lite支持的所有算子信息，包括OP的数量和每个OP支持哪些硬件平台：
-
-![opt_print_allops](https://paddlelite-data.bj.bcebos.com/doc_images/4.png)
-
-`./opt ----print_supported_ops=true  --valid_targets=x86`
-
-以上命令可以打印出当`valid_targets=x86`时Paddle-Lite支持的所有OP：
-
-![opt_print_supportedops](https://paddlelite-data.bj.bcebos.com/doc_images/5.png)
-
-## 其他功能：合并x2paddle和opt的一键脚本
+## 合并x2paddle和opt的一键脚本

 **背景**：如果想用Paddle-Lite运行第三方来源（tensorflow、caffe、onnx）模型，一般需要经过两次转化。即使用x2paddle工具将第三方模型转化为PaddlePaddle格式，再使用opt将PaddlePaddle模型转化为Padde-Lite可支持格式。
-为了简化这一过程，我们提供一键脚本，将x2paddle转化和opt转化合并：
-
-**一键转化脚本**：[auto_transform.sh](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.3/lite/tools/auto_transform.sh)
-
-
-**环境要求**：使用`auto_transform.sh`脚本转化第三方模型时，需要先安装x2paddle环境，请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和x2paddle依赖项(tensorflow、caffe等)。
+为了简化这一过程，我们提供了：

-**使用方法**：
-
-（1）打印帮助帮助信息：` sh ./auto_transform.sh`
-
-（2）转化模型方法
-
-```bash
-USAGE:
-    auto_transform.sh combines the function of x2paddle and opt, it can 
-    tranform model from tensorflow/caffe/onnx form into paddle-lite naive-buffer form.
----------------------------------------
-example:
-    sh ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result
----------------------------------------
-Arguments about x2paddle:
-    --framework=(tensorflow|caffe|onnx);
-    --model='model file for tensorflow or onnx';
-    --prototxt='proto file for caffe' --weight='weight file for caffe'
- For TensorFlow:
-   --framework=tensorflow --model=tf_model.pb
-
- For Caffe:
-   --framework=caffe --prototxt=deploy.prototxt --weight=deploy.caffemodel
-
- For ONNX
-   --framework=onnx --model=onnx_model.onnx
-
-Arguments about opt:
-    --valid_targets=(arm|opencl|x86|npu|xpu); valid targets on Paddle-Lite.
-    --fluid_save_dir='path to outputed model after x2paddle'
-    --optimize_out='path to outputed Paddle-Lite model'
----------------------------------------
-```
+ [合并x2paddle和opt的一键脚本](./opt/x2paddle&opt)
--- a/docs/user_guides/opt/opt_bin.md
+++ b/docs/user_guides/opt/opt_bin.md
+## 使用opt转化模型
+
+opt是 x86 平台上的可执行文件，需要在PC端运行：支持Linux终端和Mac终端。
+
+### 帮助信息
+ 执行opt时不加入任何输入选项，会输出帮助信息，提示当前支持的选项：
+```bash
+ ./opt
+```
+![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png)
+
+### 功能一：转化模型为Paddle-Lite格式
+opt可以将PaddlePaddle的部署模型格式转化为Paddle-Lite 支持的模型格式，期间执行的操作包括：
+
+- 将protobuf格式的模型文件转化为naive_buffer格式的模型文件，有效降低模型体积
+- 执行“量化、子图融合、混合调度、Kernel优选”等图优化操作，提升其在Paddle-Lite上的运行速度、内存占用等效果
+
+模型优化过程：
+
+（1）准备待优化的PaddlePaddle模型
+
+PaddlePaddle模型有两种保存格式：
+   Combined Param：所有参数信息保存在单个文件`params`中，模型的拓扑信息保存在`__model__`文件中。
+
+![opt_combined_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fcombined_model.png)
+
+   Separated Param：参数信息分开保存在多个参数文件中，模型的拓扑信息保存在`__model__`文件中。
+![opt_seperated_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fseperated_model.png)
+
+(2) 终端中执行`opt`优化模型
+**使用示例**：转化`mobilenet_v1`模型
+
+```shell
+./opt --model_dir=./mobilenet_v1 \
+      --valid_targets=arm \
+      --optimize_out_type=naive_buffer \
+      --optimize_out=mobilenet_v1_opt
+```
+以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型，优化后的模型文件为`mobilenet_v1_opt.nb`，转化结果如下图所示：
+
+![opt_resulted_model](https://paddlelite-data.bj.bcebos.com/doc_images/2.png)
+
+
+(3) **更详尽的转化命令**总结：
+
+```shell
+./opt \
+    --model_dir=<model_param_dir> \
+    --model_file=<model_path> \
+    --param_file=<param_path> \
+    --optimize_out_type=(protobuf|naive_buffer) \
+    --optimize_out=<output_optimize_model_dir> \
+    --valid_targets=(arm|opencl|x86|npu|xpu) \
+    --record_tailoring_info=(true|false)
+```
+
+| 选项         | 说明 |
+| ------------------- | ------------------------------------------------------------ |
+| --model_dir         | 待优化的PaddlePaddle模型（非combined形式）的路径 |
+| --model_file        | 待优化的PaddlePaddle模型（combined形式）的网络结构文件路径。 |
+| --param_file        | 待优化的PaddlePaddle模型（combined形式）的权重文件路径。 |
+| --optimize_out_type | 输出模型类型，目前支持两种类型：protobuf和naive_buffer，其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测，请将此选项设置为naive_buffer。默认为protobuf。 |
+| --optimize_out      | 优化模型的输出路径。                                         |
+| --valid_targets     | 指定模型可执行的backend，默认为arm。目前可支持x86、arm、opencl、npu、xpu，可以同时指定多个backend(以空格分隔)，Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU（Kirin 810/990 Soc搭载的达芬奇架构NPU），应当设置为npu, arm。 |
+| --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时，则设置该选项为true，以记录优化后模型含有的kernel和OP信息，默认为false。 |
+
+* 如果待优化的fluid模型是非combined形式，请设置`--model_dir`，忽略`--model_file`和`--param_file`。
+* 如果待优化的fluid模型是combined形式，请设置`--model_file`和`--param_file`，忽略`--model_dir`。
+* 优化后的模型为以`.nb`名称结尾的单个文件。
+* 删除`prefer_int8_kernel`的输入参数，`opt`自动判别是否是量化模型，进行相应的优化操作。
+
+### 功能二：统计模型算子信息、判断是否支持
+
+opt可以统计并打印出model中的算子信息、判断Paddle-Lite是否支持该模型。并可以打印出当前Paddle-Lite的算子支持情况。
+
+（1）使用opt统计模型中算子信息
+
+下面命令可以打印出mobilenet_v1模型中包含的所有算子，并判断在硬件平台`valid_targets`下Paddle-Lite是否支持该模型
+
+`./opt --print_model_ops=true  --model_dir=mobilenet_v1 --valid_targets=arm`
+
+![opt_print_modelops](https://paddlelite-data.bj.bcebos.com/doc_images/3.png)
+
+（2）使用opt打印当前Paddle-Lite支持的算子信息
+
+`./opt --print_all_ops=true`
+
+以上命令可以打印出当前Paddle-Lite支持的所有算子信息，包括OP的数量和每个OP支持哪些硬件平台：
+
+![opt_print_allops](https://paddlelite-data.bj.bcebos.com/doc_images/4.png)
+
+`./opt --print_supported_ops=true  --valid_targets=x86`
+
+以上命令可以打印出当`valid_targets=x86`时Paddle-Lite支持的所有OP：
+
+![opt_print_supportedops](https://paddlelite-data.bj.bcebos.com/doc_images/5.png)
--- a/docs/user_guides/opt/opt_python.md
+++ b/docs/user_guides/opt/opt_python.md
+
+## python调用opt转化模型
+
+安装了paddle-lite 的python库后，可以通过python调用 opt 工具转化模型。（支持MAC&Ubuntu系统）
+
+### 安装Paddle-Lite
+
+```
+pip install paddlelite
+```
+
+### 帮助信息
+安装成功后可以查看帮助信息
+```bash
+ paddle_lite_opt
+```
+![](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/python_opt/help.jpg)
+
+### 功能一：转化模型为Paddle-Lite格式
+opt可以将PaddlePaddle的部署模型格式转化为Paddle-Lite 支持的模型格式，期间执行的操作包括：
+
+- 将protobuf格式的模型文件转化为naive_buffer格式的模型文件，有效降低模型体积
+- 执行“量化、子图融合、混合调度、Kernel优选”等图优化操作，提升其在Paddle-Lite上的运行速度、内存占用等效果
+
+模型优化过程：
+
+（1）准备待优化的PaddlePaddle模型
+
+PaddlePaddle模型有两种保存格式：
+   Combined Param：所有参数信息保存在单个文件`params`中，模型的拓扑信息保存在`__model__`文件中。
+
+![opt_combined_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fcombined_model.png)
+
+   Separated Param：参数信息分开保存在多个参数文件中，模型的拓扑信息保存在`__model__`文件中。
+![opt_seperated_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fseperated_model.png)
+
+(2) 终端中执行`opt`优化模型
+**使用示例**：转化`mobilenet_v1`模型
+
+```
+paddle_lite_opt --model_dir=./mobilenet_v1 \
+      --valid_targets=arm \
+      --optimize_out_type=naive_buffer \
+      --optimize_out=mobilenet_v1_opt
+```
+以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型，优化后的模型文件为`mobilenet_v1_opt.nb`，转化结果如下图所示：
+
+![opt_resulted_model](https://paddlelite-data.bj.bcebos.com/doc_images/2.png)
+
+
+(3) **更详尽的转化命令**总结：
+
+```shell
+paddle_lite_opt \
+    --model_dir=<model_param_dir> \
+    --model_file=<model_path> \
+    --param_file=<param_path> \
+    --optimize_out_type=(protobuf|naive_buffer) \
+    --optimize_out=<output_optimize_model_dir> \
+    --valid_targets=(arm|opencl|x86|npu|xpu) \
+    --record_tailoring_info=(true|false)
+```
+
+| 选项         | 说明 |
+| ------------------- | ------------------------------------------------------------ |
+| --model_dir         | 待优化的PaddlePaddle模型（非combined形式）的路径 |
+| --model_file        | 待优化的PaddlePaddle模型（combined形式）的网络结构文件路径。 |
+| --param_file        | 待优化的PaddlePaddle模型（combined形式）的权重文件路径。 |
+| --optimize_out_type | 输出模型类型，目前支持两种类型：protobuf和naive_buffer，其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测，请将此选项设置为naive_buffer。默认为protobuf。 |
+| --optimize_out      | 优化模型的输出路径。                                         |
+| --valid_targets     | 指定模型可执行的backend，默认为arm。目前可支持x86、arm、opencl、npu、xpu，可以同时指定多个backend(以空格分隔)，Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU（Kirin 810/990 Soc搭载的达芬奇架构NPU），应当设置为npu, arm。 |
+| --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时，则设置该选项为true，以记录优化后模型含有的kernel和OP信息，默认为false。 |
+
+* 如果待优化的fluid模型是非combined形式，请设置`--model_dir`，忽略`--model_file`和`--param_file`。
+* 如果待优化的fluid模型是combined形式，请设置`--model_file`和`--param_file`，忽略`--model_dir`。
+* 优化后的模型为以`.nb`名称结尾的单个文件。
+* 删除`prefer_int8_kernel`的输入参数，`opt`自动判别是否是量化模型，进行相应的优化操作。
+
+### 功能二：统计模型算子信息、判断是否支持
+
+opt可以统计并打印出model中的算子信息、判断Paddle-Lite是否支持该模型。并可以打印出当前Paddle-Lite的算子支持情况。
+
+（1）使用opt统计模型中算子信息
+
+下面命令可以打印出mobilenet_v1模型中包含的所有算子，并判断在硬件平台`valid_targets`下Paddle-Lite是否支持该模型
+
+`paddle_lite_opt --print_model_ops=true  --model_dir=mobilenet_v1 --valid_targets=arm`
+
+![opt_print_modelops](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/python_opt/check_model.png)
+
+（2）使用opt打印当前Paddle-Lite支持的算子信息
+
+`paddle_lite_opt --print_all_ops=true`
+
+以上命令可以打印出当前Paddle-Lite支持的所有算子信息，包括OP的数量和每个OP支持哪些硬件平台：
+
+![opt_print_allops](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/python_opt/print_op.png)
+
+`paddle_lite_opt --print_supported_ops=true  --valid_targets=x86`
+
+以上命令可以打印出当`valid_targets=x86`时Paddle-Lite支持的所有OP：
+
+![opt_print_supportedops](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/python_opt/print_x86op.png)
--- a/docs/user_guides/opt/x2paddle&opt.md
+++ b/docs/user_guides/opt/x2paddle&opt.md
+## 合并x2paddle和opt的一键脚本
+
+**背景**：如果想用Paddle-Lite运行第三方来源（tensorflow、caffe、onnx）模型，一般需要经过两次转化。即使用x2paddle工具将第三方模型转化为PaddlePaddle格式，再使用opt将PaddlePaddle模型转化为Paddle-Lite可支持格式。
+为了简化这一过程，我们提供一键脚本，将x2paddle转化和opt转化合并：
+
+**一键转化脚本**：[auto_transform.sh](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.3/lite/tools/auto_transform.sh)
+
+
+**环境要求**：使用`auto_transform.sh`脚本转化第三方模型时，需要先安装x2paddle环境，请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和x2paddle依赖项(tensorflow、caffe等)。
+
+**使用方法**：
+
+（1）打印帮助信息：` sh ./auto_transform.sh`
+
+（2）转化模型方法
+
+```bash
+USAGE:
+    auto_transform.sh combines the function of x2paddle and opt, it can 
+    tranform model from tensorflow/caffe/onnx form into paddle-lite naive-buffer form.
+----------------------------------------
+example:
+    sh ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result
+----------------------------------------
+Arguments about x2paddle:
+    --framework=(tensorflow|caffe|onnx);
+    --model='model file for tensorflow or onnx';
+    --prototxt='proto file for caffe' --weight='weight file for caffe'
+ For TensorFlow:
+   --framework=tensorflow --model=tf_model.pb
+
+ For Caffe:
+   --framework=caffe --prototxt=deploy.prototxt --weight=deploy.caffemodel
+
+ For ONNX
+   --framework=onnx --model=onnx_model.onnx
+
+Arguments about opt:
+    --valid_targets=(arm|opencl|x86|npu|xpu); valid targets on Paddle-Lite.
+    --fluid_save_dir='path to outputed model after x2paddle'
+    --optimize_out='path to outputed Paddle-Lite model'
+----------------------------------------
+```
--- a/docs/user_guides/post_quant_no_data.md
+++ b/docs/user_guides/post_quant_no_data.md
-# 模型量化-无校准数据训练后量化
+# 模型量化-动态离线量化

-本文首先简单介绍无校准数据训练后量化，然后说明产出量化模型，最后阐述量化模型预测。
+本文首先简单介绍动态离线量化，然后说明产出量化模型，最后阐述量化模型预测。

 ## 1 简介

-无校准数据训练后量化，将模型中特定OP的权重从FP32类型量化成INT8/16类型，可以减小预测模型的大小。使用该量化模型预测，首先将INT8/16类型的权重反量化成FP32类型，然后再进行预测。
+动态离线量化，将模型中特定OP的权重从FP32类型量化成INT8/16类型。
+
+该量化模型有两种预测方式：第一种是反量化预测方式，即首先将INT8/16类型的权重反量化成FP32类型，然后再使用FP32浮点运算进行预测；第二种是量化预测方式，即在预测中动态计算量化OP输入的量化信息，基于量化的输入和权重进行INT8整型运算。
+
+注意，目前PaddleLite仅仅支持第一种反量化预测方式。

 使用条件：
 * 有训练好的预测模型

 使用步骤：
-* 产出量化模型：使用PaddlePaddle调用无校准数据训练后量化接口，产出量化模型
+* 产出量化模型：使用PaddlePaddle调用动态离线量化接口，产出量化模型
 * 量化模型预测：使用PaddleLite加载量化模型进行预测推理

 优点：
@@ -18,11 +22,11 @@
 * 权重量化成INT8类型，模型精度会受到影响，模型大小为原始的1/4

 缺点：
-* 只可以减小模型大小，不能加快模型推理
+* 目前只支持反量化预测方式，主要可以减小模型大小，对特定加载权重费时的模型可以起到一定加速效果

 ## 2 产出量化模型

-大家可以使用PaddlePaddle调用无校准数据训练后量化接口，得到量化模型。
+目前该方法还没有在PaddleSlim中集成，大家可以使用PaddlePaddle调用动态离线量化接口，得到量化模型。

 ### 2.1 安装PaddlePaddle

@@ -32,9 +36,9 @@

 准备已经训练好的FP32预测模型，即 `save_inference_model()` 保存的模型。

-### 2.3 调用无校准数据训练后量化
+### 2.3 调用动态离线量化

-对于调用无校准数据训练后量化，首先给出一个例子。
+对于调用动态离线量化，首先给出一个例子。

 ```python
 from paddle.fluid.contrib.slim.quantization import WeightQuantization
@@ -52,7 +56,7 @@ weight_quant.quantize_weight_to_int(save_model_dir=save_model_dir,
 执行完成后，可以在 `save_model_dir/quantized_model` 目录下得到量化模型。


-对于调用无校准数据训练后量化，以下对api接口进行详细介绍。
+对于调用动态离线量化，以下对api接口进行详细介绍。

 ```python
 class WeightQuantization(model_dir, model_filename=None, params_filename=None)
@@ -85,11 +89,11 @@ WeightQuantization.quantize_weight_to_int(self,

 ## 3 量化模型预测

-目前，对于无校准数据训练后量化产出的量化模型，只能使用PaddleLite进行预测部署。
+目前，对于动态离线量化产出的量化模型，只能使用PaddleLite进行预测部署。

 很简单，首先使用PaddleLite提供的模型转换工具（opt）将量化模型转换成移动端预测的模型，然后加载转换后的模型进行预测部署。

-注意，PaddleLite 2.3版本才支持无校准数据训练后量化产出的量化，所以转换工具和预测库必须是2.3及之后的版本。
+注意，PaddleLite 2.3版本才支持动态离线量化产出的量化模型，所以转换工具和预测库必须是2.3及之后的版本。

 ### 3.1 模型转换


--- a/docs/user_guides/post_quant_with_data.md
+++ b/docs/user_guides/post_quant_with_data.md
-# 模型量化-有校准数据训练后量化
+# 模型量化-静态离线量化

 ## 1 简介

-有校准数据训练后量化，使用少量校准数据计算量化因子，可以快速得到量化模型。使用该量化模型进行预测，可以减少计算量、降低计算内存、减小模型大小。
+静态离线量化，使用少量校准数据计算量化因子，可以快速得到量化模型。使用该量化模型进行预测，可以减少计算量、降低计算内存、减小模型大小。

-有校准数据训练后量化中，有两种计算量化因子的方法，非饱和量化方法和饱和量化方法。非饱和量化方法计算整个Tensor的绝对值最大值`abs_max`，将其映射为127。饱和量化方法使用KL散度计算一个合适的阈值`T` (`0<T<mab_max`)，将其映射为127。一般而言，待量化Op的权重采用非饱和量化方法，待量化Op的激活（输入和输出）采用饱和量化方法 。
+静态离线量化中，有两种计算量化因子的方法，非饱和量化方法和饱和量化方法。非饱和量化方法计算整个Tensor的绝对值最大值`abs_max`，将其映射为127。饱和量化方法使用KL散度计算一个合适的阈值`T` (`0<T<abs_max`)，将其映射为127。一般而言，待量化Op的权重采用非饱和量化方法，待量化Op的激活（输入和输出）采用饱和量化方法 。

 使用条件：
 * 有训练好的预测模型
 * 有少量校准数据，比如100~500张图片

 使用步骤：
-* 产出量化模型：使用PaddleSlim调用有校准数据训练后量化接口，产出量化模型
+* 产出量化模型：使用PaddleSlim调用静态离线量化接口，产出量化模型
 * 量化模型预测：使用PaddleLite加载量化模型进行预测推理

 优点：
@@ -24,7 +24,7 @@

 ## 2 产出量化模型

-大家可以使用PaddleSlim调用有校准数据训练后量化接口，得到量化模型。
+大家可以使用PaddleSlim调用静态离线量化接口，得到量化模型。

 ### 2.1 安装PaddleSlim

@@ -37,12 +37,12 @@

 ### 2.3 配置校准数据生成器

-有校准数据训练后量化内部使用异步数据读取的方式读取校准数据，大家只需要根据模型的输入，配置读取数据的sample_generator。sample_generator是Python生成器，**必须每次返回单个样本数据**，会用作`DataLoader.set_sample_generator()`的数据源。
+静态离线量化内部使用异步数据读取的方式读取校准数据，大家只需要根据模型的输入，配置读取数据的sample_generator。sample_generator是Python生成器，**必须每次返回单个样本数据**，会用作`DataLoader.set_sample_generator()`的数据源。
 建议参考[异步数据读取文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/data_preparing/use_py_reader.html)和本文示例，学习如何配置校准数据生成器。

-### 2.4 调用有校准数据训练后量化
+### 2.4 调用静态离线量化

-对于调用有校准数据训练后量化，首先给出一个例子，让大家有个直观了解。
+对于调用静态离线量化，首先给出一个例子，让大家有个直观了解。

 ```python
 import paddle.fluid as fluid

--- a/docs/user_guides/release_lib.md
+++ b/docs/user_guides/release_lib.md
@@ -3,59 +3,70 @@

 ## 编译版本介绍

- ARM_Version=`armv7/armv8`                        arm版本，可选择armv7或者armv8
+- arch=`armv7/armv8`                                       arm版本，可选择armv7或者armv8
+- arm_os=`android/ios/armlinux`    安装平台，支持的arm端移动平台包括 `ios`、`armlinux`和`android`
+- toolchain=`gcc/clang`                                 源码编译时的编译器，默认为`gcc`编译器
+- android_stl=`c++_static/c++_shared`     Lite预测库链接STL库的方式，支持静态或动态链接
+- with_extra=`ON/OFF`                                     是否编译全量OP，OFF时只编译CV相关基础OP，[参数详情](library)
+- with_cv=`ON/OFF`                                          是否编译Paddle-Lite CV 相关API

- arm_os=`android\ios\ios64\armlinux`   安装平台，支持的arm端移动平台包括 `ios\ios64`、`armlinux`和`android`

- arm_lang=`gcc/clang`                                  源码编译时的编译器，默认为`gcc`编译器
+## Android（toolchain=gcc）

- arm_stl=`c++_static/c++_shared`             Lite预测库链接STL库的方式，支持静态或动态链接
-
- build_extra=`ON/OFF`                                     是否编译全量OP，OFF时只编译CV相关基础OP，[参数详情](library)
-
-  `tiny_publish/full_publish`                   编译模式，`tiny_publish`编译移动端部署库、`full_publish`编译部署库的同时编译第三方依赖库
-
-
-## Android
-
-|ARM Version|build_extra|arm_stl|target|下载|
+| Arch  |with_extra|android_stl|with_cv|下载|
 |:-------:|:-----:|:-----:|:-----:|:-------:|
-|armv7|OFF|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_static.tiny_publish.tar.gz)|
-|armv7|OFF|c++_static|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_static.full_publish.tar.gz)|
-|armv7|OFF|c++_shared|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_shared.tiny_publish.tar.gz)|
-|armv7|OFF|c++_shared|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_shared.full_publish.tar.gz)|
-|armv7|ON|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.tiny_publish.tar.gz)|
-|armv7|ON|c++_static|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.full_publish.tar.gz)|
-|armv7|ON|c++_shared|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.tiny_publish.tar.gz)|
-|armv7|ON|c++_shared|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.full_publish.tar.gz)|
-|armv8|OFF|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.tiny_publish.tar.gz)|
-|armv8|OFF|c++_static|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.full_publish.tar.gz)|
-|armv8|OFF|c++_shared|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_shared.tiny_publish.tar.gz)|
-|armv8|OFF|c++_shared|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_shared.full_publish.tar.gz)|
-|armv8|ON|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.tiny_publish.tar.gz)|
-|armv8|ON|c++_static|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.full_publish.tar.gz)|
-|armv8|ON|c++_shared|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.tiny_publish.tar.gz)|
-|armv8|ON|c++_shared|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.full_publish.tar.gz)|
+|armv7|OFF|c++_shared|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv7.gcc.c++_shared.CV_OFF.tar.gz)|
+|armv7|OFF|c++_shared|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv7.gcc.c++_shared.CV_ON.tar.gz)|
+|armv7|ON|c++_shared|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.CV_OFF.tar.gz)|
+|armv7|ON|c++_shared|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.CV_ON.tar.gz)|
+|armv7|OFF|c++_static|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv7.gcc.c++_static.CV_OFF.tar.gz)|
+|armv7|OFF|c++_static|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv7.gcc.c++_static.CV_ON.tar.gz)|
+|armv7|ON|c++_static|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.CV_OFF.tar.gz)|
+|armv7|ON|c++_static|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.CV_ON.tar.gz)|
+|armv8|OFF|c++_shared|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv8.gcc.c++_shared.CV_OFF.tar.gz)|
+|armv8|OFF|c++_shared|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv8.gcc.c++_shared.CV_ON.tar.gz)|
+|armv8|ON|c++_shared|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.CV_OFF.tar.gz)|
+|armv8|ON|c++_shared|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.CV_ON.tar.gz)|
+|armv8|OFF|c++_static|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv8.gcc.c++_static.CV_OFF.tar.gz)|
+|armv8|OFF|c++_static|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv8.gcc.c++_static.CV_ON.tar.gz)|
+|armv8|ON|c++_static|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.CV_OFF.tar.gz)|
+|armv8|ON|c++_static|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/Android/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.CV_ON.tar.gz)|


 ## iOS

-|ARM Version|arm_os|with_extra|下载|
+|ARM Version|with_extra|with_cv|下载|
 |:-------:|:-----:|:-----:|:-----:|
-|armv7|ios|OFF|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.ios.armv7.tar.gz)|
-|armv7|ios|ON|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.ios.armv7.with_extra.tar.gz)|
-|armv8|ios64|OFF|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.ios64.armv8.tar.gz)|
-|armv8|ios64|ON|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.ios64.armv8.with_extra.tar.gz)|
+|armv7|OFF|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/iOS/inference_lite_lib.ios.armv7.CV_OFF.tar.gz)|
+|armv7|OFF|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/iOS/inference_lite_lib.ios.armv7.CV_ON.tar.gz)|
+|armv7|ON|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/iOS/inference_lite_lib.ios.armv7.with_extra.CV_OFF.tar.gz)|
+|armv7|ON|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/iOS/inference_lite_lib.ios.armv7.with_extra.CV_ON.tar.gz)|
+|armv8|OFF|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/iOS/inference_lite_lib.ios64.armv8.CV_OFF.tar.gz)|
+|armv8|OFF|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/iOS/inference_lite_lib.ios64.armv8.CV_ON.tar.gz)|
+|armv8|ON|OFF|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/iOS/inference_lite_lib.ios64.armv8.with_extra.CV_OFF.tar.gz)|
+|armv8|ON|ON|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/iOS/inference_lite_lib.ios64.armv8.with_extra.CV_ON.tar.gz)|
+
+## x86
+
+|Operating System|下载|
+|:-------:|:-----:|
+|Ubuntu (Linux)|[release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/X86/Linux/inference_lite_lib.x86.linux.tar.gz)|


 ## opt 工具

 | 运行系统 |      下载       |
 | :---------: |  :--------------: |
-|    Linux    | [release/v2.3](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) |
-|    MacOs   | [release/v2.3](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
+|    Linux    | [release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt) |
+|    MacOs   | [release/v2.6.1](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt_mac) |

+## 安装Paddle-Lite python 库方法

+- 支持平台： windows10、Ubuntu、Mac
+- python version: 2.7、3.5、3.6、 3.7
+```
+pip install paddlelite
+```

 ## 对应源码编译方法


--- a/docs/user_guides/source_compile.md
+++ b/docs/user_guides/source_compile.md

-# 预测库编译
+# 源码编译方法

 PaddleLite已经提供官方Release预测库下载，请参考[文档](release_lib)。

@@ -10,11 +10,12 @@ PaddleLite 提供了移动端的一键源码编译脚本 `lite/tools/build.sh`

 ## 一、环境准备

-目前支持三种编译的环境：
+目前支持四种编译的环境：

 1. Docker 容器环境，
 2. Linux（推荐 Ubuntu 16.04）环境，
-3. Mac OS 环境。
+3. Mac OS 环境，
+4. [Windows 环境](../demo_guides/x86.html#windows)

 ### 1、 Docker开发环境

@@ -156,7 +157,7 @@ wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \

 ###### 编译环境要求

- gcc、g++、git、make、wget、python
+- gcc、g++、git、make、wget、python、pip、python-dev、patchelf
 - cmake（建议使用3.10或以上版本）

 ###### 具体步骤
@@ -167,7 +168,7 @@ wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
 # 1. Install basic software
 apt update
 apt-get install -y --no-install-recommends \
-  gcc g++ make wget python unzip
+  gcc g++ make wget python unzip patchelf python-dev

 # 2. install cmake 3.10 or above
 wget https://www.cmake.org/files/v3.10/cmake-3.10.3.tar.gz

--- a/docs/user_guides/tutorial.md
+++ b/docs/user_guides/tutorial.md
@@ -13,11 +13,9 @@ Lite框架拥有强大的加速、优化策略及实现，其中包含诸如量

 opt的详细介绍，请您参考 [模型优化方法](model_optimize_tool) 。

-使用opt，您只需编译后在开发机上执行以下代码：
+下载opt工具后执行以下代码：

 ``` shell
-$ cd <PaddleLite_base_path>
-$ cd build.opt/lite/api/
 $ ./opt \
    --model_dir=<model_param_dir> \
    --model_file=<model_path> \

--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -224,11 +224,11 @@ if (LITE_WITH_X86)
        add_dependencies(publish_inference publish_inference_x86_cxx_lib)

        add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
-            COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party"
-            COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party"
-            COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
+            COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party/mklml"
+            COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install/mklml" "${INFER_LITE_PUBLISH_ROOT}/third_party/mklml"
            COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
-            COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/lite/demo/cxx" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+            COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light"
+            COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full"
        )
        add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
        add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3)
@@ -327,7 +327,6 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
                add_dependencies(publish_inference tiny_publish_cxx_lib)
                if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
                    add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
-                                COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
                                COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
                endif()
            endif()

--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -46,7 +46,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
        target_link_libraries(paddle_light_api_shared shlwapi.lib)
    endif()
    target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
-    if(NOT APPLE AND NOT WIN32)
+   if(APPLE)
+        set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
+        set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
+        add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
+        add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE})
+        set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS})
+        add_dependencies(paddle_full_api_shared custom_linker_map)
+   elseif(NOT WIN32)
        set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
        set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
        add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
@@ -167,6 +174,7 @@ set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
        "A path setting inference demo download directories.")

 if(WITH_TESTING)
+    if(NOT WITH_COVERAGE)
        lite_cc_test(test_cxx_api SRCS cxx_api_test.cc
           DEPS cxx_api mir_passes lite_api_test_helper
           ${ops} ${host_kernels}
@@ -186,6 +194,7 @@ if(WITH_TESTING)
           ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
                --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
        add_dependencies(test_cxx_api extern_lite_download_lite_naive_model_tar_gz)
+    endif()
    if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
        if(LITE_WITH_X86)
            lite_cc_test(test_googlenet SRCS test_googlenet_lite.cc
@@ -324,7 +333,8 @@ bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api

 # These tests needs CLI arguments, and is not supported in ARM CI.
 # TODO(Superjomn) support latter.
-lite_cc_test(test_light_api SRCS light_api_test.cc
+if(NOT WITH_COVERAGE)
+    lite_cc_test(test_light_api SRCS light_api_test.cc
        DEPS light_api program mir_passes paddle_api_light
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
@@ -332,7 +342,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
        BM_DEPS ${bm_kernels}
        ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)

-lite_cc_test(test_apis SRCS apis_test.cc
+    lite_cc_test(test_apis SRCS apis_test.cc
        DEPS cxx_api light_api ${ops} paddle_api_light
        CL_DEPS ${opencl_kernels}
        X86_DEPS ${x86_kernels}
@@ -343,6 +353,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
        MLU_DEPS ${mlu_kernels}
        ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
        --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
+endif()

 if (LITE_WITH_JAVA AND LITE_WITH_ARM)
    add_subdirectory(android)
@@ -368,7 +379,8 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL)
    add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
 endif(LITE_ON_MODEL_OPTIMIZE_TOOL)

-lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light
+if(NOT WITH_COVERAGE)
+    lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light
      ${ops}
      ARM_DEPS ${arm_kernels}
      CV_DEPS paddle_cv_arm
@@ -382,8 +394,9 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
      BM_DEPS ${bm_kernels}
      MLU_DEPS ${mlu_kernels}
      ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
-if (WITH_TESTING)
+    if (WITH_TESTING)
        add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
+    endif()
 endif()

 # Some bins

--- a/lite/api/android/jni/native/convert_util_jni.h
+++ b/lite/api/android/jni/native/convert_util_jni.h
@@ -14,8 +14,8 @@ limitations under the License. */
 #pragma once

 #include <jni.h>
-#include <string>
-#include <vector>
+#include <string>  // NOLINT
+#include <vector>  // NOLINT

 #include "lite/api/light_api.h"
 #include "lite/api/paddle_api.h"
@@ -78,6 +78,14 @@ inline jfloatArray cpp_array_to_jfloatarray(JNIEnv *env,
  return result;
 }

+// Copies `len` bytes starting at `buf` into a newly allocated Java byte[].
+// Returns nullptr when the JVM cannot allocate the array, instead of handing
+// a null array reference to SetByteArrayRegion.
+inline jbyteArray cpp_array_to_jbytearray(JNIEnv *env,
+                                          const int8_t *buf,
+                                          int64_t len) {
+  jbyteArray result = env->NewByteArray(len);
+  if (result == nullptr) {
+    return nullptr;
+  }
+  env->SetByteArrayRegion(result, 0, len, buf);
+  return result;
+}
+
 inline jintArray cpp_array_to_jintarray(JNIEnv *env,
                                        const int *buf,
                                        int64_t len) {
@@ -86,11 +94,11 @@ inline jintArray cpp_array_to_jintarray(JNIEnv *env,
  return result;
 }

-inline jbyteArray cpp_array_to_jbytearray(JNIEnv *env,
-                                          const int8_t *buf,
+inline jlongArray cpp_array_to_jlongarray(JNIEnv *env,
+                                          const int64_t *buf,
                                          int64_t len) {
-  jbyteArray result = env->NewByteArray(len);
-  env->SetByteArrayRegion(result, 0, len, buf);
+  jlongArray result = env->NewLongArray(len);
+  env->SetLongArrayRegion(result, 0, len, buf);
  return result;
 }


--- a/lite/api/android/jni/native/tensor_jni.cc
+++ b/lite/api/android/jni/native/tensor_jni.cc
@@ -136,6 +136,22 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I(
  return JNI_TRUE;
 }

+// Native side of the Java overload Tensor.nativeSetData(long[] buf).
+// JNI resolves overloaded natives by their mangled argument signature: "_3"
+// encodes '[' and 'J' is the JNI type code for long, so the suffix has to be
+// "___3J" ('L' is the prefix for object types and is never looked up here).
+// Returns JNI_FALSE when the tensor handle is null or the array length does
+// not match the tensor's element count; JNI_TRUE after the copy otherwise.
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3J(
+    JNIEnv *env, jobject jtensor, jlongArray buf) {
+  std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+  if (tensor == nullptr || (*tensor == nullptr)) {
+    return JNI_FALSE;
+  }
+  // The Java array must hold exactly one value per tensor element.
+  int64_t buf_size = static_cast<int64_t>(env->GetArrayLength(buf));
+  if (buf_size != product((*tensor)->shape())) {
+    return JNI_FALSE;
+  }
+
+  // Copy the Java array straight into the tensor's int64 storage.
+  int64_t *input = (*tensor)->mutable_data<int64_t>();
+  env->GetLongArrayRegion(buf, 0, buf_size, input);
+  return JNI_TRUE;
+}
+
 JNIEXPORT jfloatArray JNICALL
 Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *env, jobject jtensor) {
  if (is_const_tensor(env, jtensor)) {
@@ -178,6 +194,20 @@ Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *env, jobject jtensor) {
  }
 }

+// Native side of Tensor.getLongData(): copies the tensor's elements, read as
+// int64_t, into a newly created Java long[].
+JNIEXPORT jlongArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getLongData(JNIEnv *env, jobject jtensor) {
+  // Read-only and writable tensors are held through different pointer types,
+  // so each case resolves its own handle before copying.
+  if (!is_const_tensor(env, jtensor)) {
+    std::unique_ptr<Tensor> *writable =
+        get_writable_tensor_pointer(env, jtensor);
+    const int64_t count = product((*writable)->shape());
+    return cpp_array_to_jlongarray(env, (*writable)->data<int64_t>(), count);
+  }
+
+  std::unique_ptr<const Tensor> *read_only =
+      get_read_only_tensor_pointer(env, jtensor);
+  const int64_t count = product((*read_only)->shape());
+  return cpp_array_to_jlongarray(env, (*read_only)->data<int64_t>(), count);
+}
+
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(
    JNIEnv *env, jobject jtensor, jlong java_pointer) {
  if (java_pointer == 0) {

--- a/lite/api/android/jni/native/tensor_jni.h
+++ b/lite/api/android/jni/native/tensor_jni.h
@@ -57,6 +57,14 @@ Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *, jobject);
 JNIEXPORT jintArray JNICALL
 Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *, jobject);

+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    getLongData
+ * Signature: ()[J
+ */
+JNIEXPORT jlongArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getLongData(JNIEnv *, jobject);
+
 /*
 * Class:     com_baidu_paddle_lite_Tensor
 * Method:    nativeResize
@@ -89,6 +97,14 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B(
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I(
    JNIEnv *, jobject, jintArray);

+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    nativeSetData
+ * Signature: ([J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3J(
+    JNIEnv *, jobject, jlongArray);
+
 /*
 * Class:     com_baidu_paddle_lite_Tensor
 * Method:    deleteCppTensor

--- a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
+++ b/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
@@ -141,6 +141,11 @@ public class Tensor {
     */
    public native int[] getIntData();

+    /**
+     * @return the tensor data as long array.
+     */
+    public native long[] getLongData();
+
    private native boolean nativeResize(long[] dims);

    private native boolean nativeSetData(float[] buf);
@@ -149,6 +154,8 @@ public class Tensor {

    private native boolean nativeSetData(int[] buf);

+    private native boolean nativeSetData(long[] buf);
+
    /**
     * Delete C++ Tenor object pointed by the input pointer, which is presented by a
     * long value.

--- a/lite/api/benchmark.cc
+++ b/lite/api/benchmark.cc
@@ -91,6 +91,8 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
  }
  std::vector<Place> vaild_places = {
      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kInt32)},
+      Place{TARGET(kARM), PRECISION(kInt64)},
  };
  config.set_valid_places(vaild_places);
  auto predictor = lite_api::CreatePaddlePredictor(config);
@@ -161,7 +163,7 @@ void Run(const std::vector<int64_t>& input_shape,
    auto end = GetCurrentUS();
    perf_vct.push_back((end - start) / 1000.0);
  }
-  std::sort(perf_vct.begin(), perf_vct.end());
+  std::stable_sort(perf_vct.begin(), perf_vct.end());
  float min_res = perf_vct.back();
  float max_res = perf_vct.front();
  float total_res = accumulate(perf_vct.begin(), perf_vct.end(), 0.0);

--- a/lite/api/cxx_api.cc
+++ b/lite/api/cxx_api.cc
@@ -327,8 +327,10 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
    }
  }
  if (is_quantized_model) {
+#ifdef LITE_WITH_ARM
    inner_places.insert(inner_places.begin(),
                        Place{TARGET(kARM), PRECISION(kInt8)});
+#endif
  }

  Program program(desc, scope_, inner_places);

--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -35,7 +35,7 @@ namespace lite {
 void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
  config_ = config;
  auto places = config.valid_places();
-  std::vector<std::string> passes{};
+  std::vector<std::string> passes = config.get_passes_internal();
 #ifdef LITE_WITH_CUDA
  // if kCUDA is included in valid places, it should be initialized first,
  // otherwise skip this step.

--- a/lite/api/light_api.cc
+++ b/lite/api/light_api.cc
@@ -14,7 +14,7 @@

 #include "lite/api/light_api.h"
 #include <algorithm>
-#include <unordered_map>
+#include <map>
 #include "paddle_use_kernels.h"  // NOLINT
 #include "paddle_use_ops.h"      // NOLINT


--- a/lite/api/light_api_impl.cc
+++ b/lite/api/light_api_impl.cc
@@ -36,6 +36,11 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
  }
  mode_ = config.power_mode();
  threads_ = config.threads();
+
+#ifdef LITE_WITH_NPU
+  Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
+      config.subgraph_model_cache_dir());
+#endif
 }

 std::unique_ptr<lite_api::Tensor> LightPredictorImpl::GetInput(int i) {

--- a/lite/api/model_test.cc
+++ b/lite/api/model_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <gflags/gflags.h>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -25,6 +24,7 @@
 #ifdef LITE_WITH_PROFILE
 #include "lite/core/profile/basic_profiler.h"
 #endif  // LITE_WITH_PROFILE
+#include <gflags/gflags.h>

 using paddle::lite::profile::Timer;

@@ -34,6 +34,10 @@ DEFINE_string(input_shape,
 DEFINE_bool(use_optimize_nb,
            false,
            "optimized & naive buffer model for mobile devices");
+DEFINE_string(backend,
+              "arm_cpu",
+              "choose backend for valid_places: arm_cpu | opencl. Compile "
+              "OpenCL version if you choose opencl");
 DEFINE_string(arg_name, "", "the arg name");

 namespace paddle {
@@ -49,9 +53,19 @@ void OutputOptModel(const std::string& load_model_dir,
                           Place{TARGET(kX86), PRECISION(kInt64)},
                           Place{TARGET(kHost), PRECISION(kFloat)}});
 #else
+  if (FLAGS_backend == "opencl") {
+    config.set_valid_places({
+        Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
+        Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+        Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
+        Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
+        TARGET(kARM),  // enable kARM CPU kernel when no opencl kernel
+    });
+  } else {  // arm_cpu
    config.set_valid_places({
        Place{TARGET(kARM), PRECISION(kFloat)},
    });
+  }
 #endif
  auto predictor = lite_api::CreatePaddlePredictor(config);

@@ -117,16 +131,40 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
            << ", min time: " << ti.LapTimes().Min() << " ms"
            << ", max time: " << ti.LapTimes().Max() << " ms.";

-  auto output = predictor->GetOutput(0);
-  auto out = output->data<float>();
-  LOG(INFO) << "out " << out[0];
-  LOG(INFO) << "out " << out[1];
-  auto output_shape = output->shape();
-  int output_num = 1;
-  for (int i = 0; i < output_shape.size(); ++i) {
-    output_num *= output_shape[i];
+  // output summary
+  size_t output_tensor_num = predictor->GetOutputNames().size();
+  LOG(INFO) << "output tensor num:" << output_tensor_num;
+
+  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
+    auto output_tensor = predictor->GetOutput(tidx);
+    LOG(INFO) << "============= output tensor " << tidx << " =============";
+    auto tensor_shape = output_tensor->shape();
+    std::string tensor_shape_str{""};
+    int output_tensor_numel = 1;
+    for (int i = 0; i < tensor_shape.size(); ++i) {
+      output_tensor_numel *= tensor_shape[i];
+      tensor_shape_str += std::to_string(tensor_shape[i]);
+      tensor_shape_str += (i < tensor_shape.size() - 1) ? "x" : "";
+    }
+    auto out_data = output_tensor->data<float>();
+    auto out_mean =
+        paddle::lite::compute_mean<float>(out_data, output_tensor_numel);
+    auto out_std_dev = paddle::lite::compute_standard_deviation<float>(
+        out_data, output_tensor_numel, true, out_mean);
+
+    LOG(INFO) << "output tensor " << tidx << " dims:" << tensor_shape_str;
+    LOG(INFO) << "output tensor " << tidx
+              << " elements num:" << output_tensor_numel;
+    LOG(INFO) << "output tensor " << tidx
+              << " standard deviation:" << out_std_dev;
+    LOG(INFO) << "output tensor " << tidx << " mean value:" << out_mean << "\n";
+
+    // print result
+    for (int i = 0; i < output_tensor_numel; ++i) {
+      VLOG(2) << "output_tensor->data<float>()[" << i
+              << "]:" << output_tensor->data<float>()[i];
+    }
  }
-  LOG(INFO) << "output_num: " << output_num;

  // please turn off memory_optimize_pass to use this feature.
  if (FLAGS_arg_name != "") {
@@ -162,6 +200,7 @@ int main(int argc, char** argv) {
              << "--model_dir /path/to/your/model";
    exit(0);
  }
+
  std::string save_optimized_model_dir = "";
  if (FLAGS_use_optimize_nb) {
    save_optimized_model_dir = FLAGS_model_dir;

--- a/lite/api/opt.cc
+++ b/lite/api/opt.cc
@@ -92,6 +92,10 @@ std::vector<Place> ParserValidPlaces() {
          Place{TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)});
      valid_places.emplace_back(
          Place{TARGET(kARM), PRECISION(kInt32), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kARM), PRECISION(kInt64), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kARM), PRECISION(kAny), DATALAYOUT(kNCHW)});
    } else if (target_repr == "opencl") {
      valid_places.emplace_back(
          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
@@ -367,7 +371,7 @@ void CheckIfModelSupported() {
    for (size_t i = 0; i < valid_places.size(); i++) {
      targets.push_back(valid_places[i].target);
    }
-    std::sort(targets.begin(), targets.end());
+    std::stable_sort(targets.begin(), targets.end());
    targets.erase(unique(targets.begin(), targets.end()), targets.end());
    std::string targets_str = TargetToStr(targets[0]);
    for (size_t i = 1; i < targets.size(); i++) {

--- a/lite/api/opt_base.cc
+++ b/lite/api/opt_base.cc
@@ -40,12 +40,24 @@ void OptBase::SetModelType(std::string optimize_out_type) {
  }
 }

+// Forwards the caller-chosen list of pass names to the optimizer config
+// (opt_config_); the list is stored as given, without validation here.
+void OptBase::SetPassesInternal(
+    const std::vector<std::string>& passes_internal) {
+  opt_config_.set_passes_internal(passes_internal);
+}
+
 void OptBase::SetValidPlaces(const std::string& valid_places) {
  valid_places_.clear();
  auto target_reprs = lite::Split(valid_places, ",");
  for (auto& target_repr : target_reprs) {
    if (target_repr == "arm") {
-      valid_places_.emplace_back(TARGET(kARM));
+      valid_places_.emplace_back(
+          Place{TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)});
+      valid_places_.emplace_back(
+          Place{TARGET(kARM), PRECISION(kInt32), DATALAYOUT(kNCHW)});
+      valid_places_.emplace_back(
+          Place{TARGET(kARM), PRECISION(kInt64), DATALAYOUT(kNCHW)});
+      valid_places_.emplace_back(
+          Place{TARGET(kARM), PRECISION(kAny), DATALAYOUT(kNCHW)});
    } else if (target_repr == "opencl") {
      valid_places_.emplace_back(
          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
@@ -82,7 +94,7 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
         "command argument 'valid_targets'";
 }

-void OptBase::SetLiteOut(const std::string& lite_out_name) {
+void OptBase::SetOptimizeOut(const std::string& lite_out_name) {
  lite_out_name_ = lite_out_name;
 }

@@ -110,13 +122,15 @@ void OptBase::Run() {
 void OptBase::RunOptimize(const std::string& model_dir_path,
                          const std::string& model_path,
                          const std::string& param_path,
+                          const std::string& model_type,
                          const std::string& valid_places,
                          const std::string& optimized_out_path) {
  SetModelDir(model_dir_path);
  SetModelFile(model_path);
  SetParamFile(param_path);
+  SetModelType(model_type);
  SetValidPlaces(valid_places);
-  SetLiteOut(optimized_out_path);
+  SetOptimizeOut(optimized_out_path);
  CheckIfModelSupported(false);
  OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
  opt_config_.set_valid_places(valid_places_);
@@ -248,6 +262,33 @@ void OptBase::PrintHelpInfo() {
      "-----------------------------------------------------------\n";
  std::cout << "opt version:" << opt_version << std::endl << help_info;
 }
+
+// Prints the library version followed by the usage text of the command-line
+// opt tool to stdout. The text is a single string literal assembled from
+// adjacent pieces; each option is shown in the `--name=value` form.
+void OptBase::PrintExecutableBinHelpInfo() {
+  const std::string opt_version = lite::version();
+  const char help_info[] =
+      "At least one argument should be inputed. Valid arguments are listed "
+      "below:\n"
+      "  Arguments of model optimization:\n"
+      "        `--model_dir=<model_param_dir>`\n"
+      "        `--model_file=<model_path>`\n"
+      "        `--param_file=<param_path>`\n"
+      "        `--optimize_out_type=(protobuf|naive_buffer)`\n"
+      "        `--optimize_out=<output_optimize_model_dir>`\n"
+      "        `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
+      "        `--record_tailoring_info=(true|false)`\n"
+      "  Arguments of model checking and ops information:\n"
+      "        `--print_all_ops=true`   Display all the valid operators of "
+      "Paddle-Lite\n"
+      "        `--print_supported_ops=true  "
+      "--valid_targets=(arm|opencl|x86|npu|xpu)`"
+      "  Display valid operators of input targets\n"
+      "        `--print_model_ops=true  --model_dir=<model_param_dir> "
+      "--valid_targets=(arm|opencl|x86|npu|xpu)`"
+      "  Display operators in the input model\n";
+  std::cout << "paddlelite opt version:" << opt_version << std::endl
+            << help_info << std::endl;
+}
+
 // 2. Print supported info of inputed ops
 void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
  std::vector<std::string> lite_supported_targets = {"kHost",
@@ -394,7 +435,7 @@ void OptBase::CheckIfModelSupported(bool print_ops_info) {
    for (size_t i = 0; i < valid_places_.size(); i++) {
      targets.push_back(valid_places_[i].target);
    }
-    std::sort(targets.begin(), targets.end());
+    std::stable_sort(targets.begin(), targets.end());
    targets.erase(unique(targets.begin(), targets.end()), targets.end());
    std::string targets_str = TargetToStr(targets[0]);
    for (size_t i = 1; i < targets.size(); i++) {

--- a/lite/api/opt_base.h
+++ b/lite/api/opt_base.h
@@ -48,20 +48,27 @@ class LITE_API OptBase {
  void SetModelFile(const std::string &model_path);
  void SetParamFile(const std::string &param_path);
  void SetValidPlaces(const std::string &valid_places);
-  void SetLiteOut(const std::string &lite_out_name);
+  void SetOptimizeOut(const std::string &lite_out_name);
  void RecordModelInfo(bool record_strip_info = true);
  // set optimized_model type
-  void SetModelType(std::string model_type);
+  void SetModelType(std::string model_type = "naive_buffer");
+  // internal interface for developers, not recommended.
+  // choose methods of model optimizing.
+  void SetPassesInternal(const std::vector<std::string> &passes_internal = {});
  // transform and save the optimized model
  void Run();
  void RunOptimize(const std::string &model_dir_path = "",
                   const std::string &model_path = "",
                   const std::string &param_path = "",
+                   const std::string &model_type = "",
                   const std::string &valid_places = "",
                   const std::string &optimized_out_path = "");
  // fuctions of printing info
  // 1. help info
+  // 1.1 Print help info for opt python api
  void PrintHelpInfo();
+  // 1.2 Print help info for executable opt bin
+  void PrintExecutableBinHelpInfo();
  // 2. PrintOpsInfo
  void PrintOpsInfo(const std::set<std::string> &valid_ops =
                        {});  // print supported ops on target_types

--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -270,6 +270,16 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
 #endif
 }

+// Records the requested multi-encoder precision on the XPU context when the
+// library is built with LITE_WITH_XPU. In any other build the call has no
+// effect other than logging a warning.
+void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) {
+#ifdef LITE_WITH_XPU
+  lite::Context<TargetType::kXPU>::_multi_encoder_precision = precision;
+#else
+  LOG(WARNING) << "The invoking of the function "
+                  "'set_xpu_multi_encoder_precision' is "
+                  "ignored, please rebuild it with LITE_WITH_XPU=ON.";
+#endif
+}
+
 // set model data in combined format, `set_model_from_file` refers to loading
 // model from file, set_model_from_buffer refers to loading model from memory
 // buffer

--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -118,18 +118,27 @@ class LITE_API ConfigBase {
  std::string model_dir_;
  int threads_{1};
  PowerMode mode_{LITE_POWER_NO_BIND};
+  // to save subgraph model for npu/xpu/...
+  std::string subgraph_model_cache_dir_{""};

 public:
  explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
  // set Model_dir
  void set_model_dir(const std::string& x) { model_dir_ = x; }
  const std::string& model_dir() const { return model_dir_; }
-  // set Power_mode
-  void set_power_mode(PowerMode mode);
-  PowerMode power_mode() const { return mode_; }
  // set Thread
  void set_threads(int threads);
  int threads() const { return threads_; }
+  // set Power_mode
+  void set_power_mode(PowerMode mode);
+  PowerMode power_mode() const { return mode_; }
+  // set subgraph_model_dir
+  void set_subgraph_model_cache_dir(std::string subgraph_model_cache_dir) {
+    subgraph_model_cache_dir_ = subgraph_model_cache_dir;
+  }
+  const std::string& subgraph_model_cache_dir() const {
+    return subgraph_model_cache_dir_;
+  }
 };

 /// CxxConfig is the config for the Full feature predictor.
@@ -137,6 +146,7 @@ class LITE_API CxxConfig : public ConfigBase {
  std::vector<Place> valid_places_;
  std::string model_file_;
  std::string param_file_;
+  std::vector<std::string> passes_internal_{};
  bool model_from_memory_{false};
 #ifdef LITE_WITH_X86
  int x86_math_library_math_threads_ = 1;
@@ -165,7 +175,16 @@ class LITE_API CxxConfig : public ConfigBase {
    param_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
    model_from_memory_ = true;
  }
-
+  // internal interface to choose passes for model optimizing,
+  // it's designed for internal developers and not recommended
+  // for common users.
+  void set_passes_internal(
+      const std::vector<std::string>& passes_internal = {}) {
+    passes_internal_ = passes_internal;
+  }
+  const std::vector<std::string>& get_passes_internal() const {
+    return passes_internal_;
+  }
  const std::vector<Place>& valid_places() const { return valid_places_; }
  std::string model_file() const { return model_file_; }
  std::string param_file() const { return param_file_; }
@@ -216,6 +235,7 @@ class LITE_API CxxConfig : public ConfigBase {
  // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
  // thread
  void set_xpu_dev_per_thread(int dev_no = 0);
+  void set_xpu_multi_encoder_precision(const std::string& precision = "int16");
 };

 /// MobileConfig is the config for the light weight predictor, it will skip

--- a/lite/api/python/bin/paddle_lite_opt
+++ b/lite/api/python/bin/paddle_lite_opt
+#!/usr/bin/env python
+# Copyright @ 2020 Baidu. All rights reserved.
+""" python wrapper file for Paddle-Lite opt tool """
+from __future__ import print_function
+import paddlelite.lite as lite
+import argparse
+
+
+def main():
+    """Parse command-line options and drive the Paddle-Lite opt tool.
+
+    Returns:
+        int: 0 after an information query or an optimization run, 1 when the
+        required options are missing and only the help text is printed.
+    """
+    opt = lite.Opt()
+    parser = argparse.ArgumentParser()
+
+    # arguments of model optimization
+    parser.add_argument(
+        "--model_dir", type=str, required=False,
+        help="path of the model. This option will be ignored if model_file "
+             "and param_file exist")
+    parser.add_argument(
+        "--model_file", type=str, required=False,
+        help="model file path of the combined-param model.")
+    parser.add_argument(
+        "--param_file", type=str, required=False,
+        help="param file path of the combined-param model.")
+    parser.add_argument(
+        "--optimize_out_type", type=str, required=False,
+        default="naive_buffer", choices=['protobuf', 'naive_buffer'],
+        help="store type of the output optimized model. "
+             "protobuf/naive_buffer.")
+    parser.add_argument(
+        "--optimize_out", type=str, required=False,
+        help="path of the output optimized model")
+    # The value is handed to OptBase::SetValidPlaces, which splits it on ','.
+    parser.add_argument(
+        "--valid_targets", type=str, required=False, default="arm",
+        help="The targets this model is optimized for: one or more of "
+             "(arm, opencl, x86, npu, xpu), separated by commas.")
+
+    # arguments of help information
+    parser.add_argument(
+        "--print_supported_ops", type=str, default="false",
+        help="{true, false} "
+             "Print supported operators on the inputed target")
+    parser.add_argument(
+        "--print_all_ops", type=str, default="false",
+        help="{true, false} "
+             "Print all the valid operators of Paddle-Lite")
+    parser.add_argument(
+        "--print_model_ops", type=str, default="false",
+        help="{true, false} "
+             "Print operators in the input model")
+    parser.add_argument(
+        "--display_kernels", type=str, default="false",
+        help="{true, false} "
+             "Display kernel information")
+
+    # arguments of strip lib according to input model
+    parser.add_argument(
+        "--record_tailoring_info", type=str, default="false",
+        help="{true, false} "
+             "Record kernels and operators information of the optimized "
+             "model for tailoring compiling, information are stored into "
+             "optimized model path as hidden files")
+    parser.add_argument(
+        "--model_set", type=str, required=False,
+        help="path of the models set. This option will be used to specific "
+             "tailoring")
+
+    args = parser.parse_args()
+
+    # input opt params
+    if args.model_dir is not None:
+        opt.set_model_dir(args.model_dir)
+    if args.model_set is not None:
+        opt.set_modelset_dir(args.model_set)
+    if args.model_file is not None:
+        opt.set_model_file(args.model_file)
+    if args.param_file is not None:
+        opt.set_param_file(args.param_file)
+    if args.optimize_out_type is not None:
+        opt.set_model_type(args.optimize_out_type)
+    if args.optimize_out is not None:
+        opt.set_optimize_out(args.optimize_out)
+    if args.valid_targets is not None:
+        opt.set_valid_places(args.valid_targets)
+    if args.record_tailoring_info == "true":
+        opt.record_model_info(True)
+
+    # print ops info; each query prints its report and ends the run
+    if args.print_all_ops == "true":
+        opt.print_all_ops()
+        return 0
+    if args.print_supported_ops == "true":
+        opt.print_supported_ops()
+        return 0
+    if args.display_kernels == "true":
+        opt.display_kernels_info()
+        return 0
+    if args.print_model_ops == "true":
+        opt.check_if_model_supported(True)
+        return 0
+
+    # An optimization run needs an output path plus a model source: a model
+    # directory, a model/param file pair, or a model set.
+    has_model_source = (args.model_dir is not None
+                        or (args.model_file is not None
+                            and args.param_file is not None)
+                        or args.model_set is not None)
+    if not has_model_source or args.optimize_out is None:
+        opt.executablebin_help()
+        return 1
+    opt.run()
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    # Hand main()'s status to the shell so callers can detect bad arguments.
+    sys.exit(main())
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -19,8 +19,8 @@
 #include <iostream>
 #include <map>
 #include <memory>
+#include <set>
 #include <string>
-#include <unordered_set>
 #include <utility>
 #include <vector>

@@ -62,15 +62,18 @@ void BindLiteOpt(py::module *m) {
      .def("set_model_file", &OptBase::SetModelFile)
      .def("set_param_file", &OptBase::SetParamFile)
      .def("set_valid_places", &OptBase::SetValidPlaces)
-      .def("set_lite_out", &OptBase::SetLiteOut)
+      .def("set_optimize_out", &OptBase::SetOptimizeOut)
      .def("set_model_type", &OptBase::SetModelType)
      .def("record_model_info", &OptBase::RecordModelInfo)
+      .def("set_passes_internal", &OptBase::SetPassesInternal)
      .def("run", &OptBase::Run)
      .def("run_optimize", &OptBase::RunOptimize)
      .def("help", &OptBase::PrintHelpInfo)
+      .def("executablebin_help", &OptBase::PrintExecutableBinHelpInfo)
      .def("print_supported_ops", &OptBase::PrintSupportedOps)
      .def("display_kernels_info", &OptBase::DisplayKernelsInfo)
-      .def("print_all_ops", &OptBase::PrintAllOps);
+      .def("print_all_ops", &OptBase::PrintAllOps)
+      .def("check_if_model_supported", &OptBase::CheckIfModelSupported);
 }
 #endif
 static void BindLiteLightPredictor(py::module *m);
@@ -122,6 +125,7 @@ void BindLiteCxxConfig(py::module *m) {
      .def("param_file", &CxxConfig::param_file)
      .def("set_valid_places", &CxxConfig::set_valid_places)
      .def("set_model_buffer", &CxxConfig::set_model_buffer)
+      .def("set_passes_internal", &CxxConfig::set_passes_internal)
      .def("model_from_memory", &CxxConfig::model_from_memory);
 #ifdef LITE_WITH_ARM
  cxx_config.def("set_threads", &CxxConfig::set_threads)

--- a/lite/api/python/setup.py.in
+++ b/lite/api/python/setup.py.in
@@ -41,6 +41,10 @@ for file in files:
        break
 LITE_PATH = INFERENCE_LITE_LIB_PATH + '/python/install/lite'
 PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']}
+
+# copy scripts of paddlelite
+shutil.copy('${PADDLE_SOURCE_DIR}/lite/api/python/bin/paddle_lite_opt', LITE_PATH)
+
 # put all thirdparty libraries in paddlelite.libs
 PACKAGE_DATA['paddlelite.libs'] = []
 LIB_PATH = INFERENCE_LITE_LIB_PATH + '/python/install/libs/'
@@ -55,7 +59,7 @@ if '${WITH_MKL}' == 'ON':
        PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll']
 # link lite.so to paddlelite.libs
 if os.name != 'nt':
-    COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' " + LITE_PATH + "/lite.so"
+    COMMAND = "patchelf --set-rpath '$ORIGIN/libs/' " + LITE_PATH + "/lite.so"
    if os.system(COMMAND) != 0:
        raise Exception("patch third_party libs failed, command: %s" % COMMAND)

@@ -85,6 +89,7 @@ setup(
    name='paddlelite',
    version=PADDLELITE_VERSION,
    description='Paddle-Lite Library',
+    scripts=['lite/paddle_lite_opt'],
    packages=['paddlelite', 'paddlelite.libs'],
    package_dir=PACKAGE_DIR,
    package_data=PACKAGE_DATA,

--- a/lite/api/python/setup_mac.py.in
+++ b/lite/api/python/setup_mac.py.in
@@ -35,6 +35,8 @@ else:
 # core lib of paddlelite is stored as lite.so
 LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
 PACKAGE_DATA = {'paddlelite': ['lite.so']}
+# copy scripts of paddlelite
+shutil.copy('${PADDLE_SOURCE_DIR}/lite/api/python/bin/paddle_lite_opt', LITE_PATH)
 # put all thirdparty libraries in paddlelite.libs
 PACKAGE_DATA['paddlelite.libs'] = []
 LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
@@ -45,7 +47,7 @@ if '${WITH_MKL}' == 'ON':
    PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib']

 # link lite.so to paddlelite.libs
-COMMAND = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}\
+COMMAND = "install_name_tool -add_rpath \"@loader_path/libs/\" ${PADDLE_BINARY_DIR}\
 /inference_lite_lib/python/install/lite/lite.so"
 if os.system(COMMAND) != 0:
    raise Exception("patch third_party libs failed, command: %s" % COMMAND)
@@ -66,6 +68,7 @@ setup(
    name='paddlelite',
    version=PADDLELITE_VERSION,
    description='Paddle-Lite Library',
+    scripts=['lite/paddle_lite_opt'],
    packages=['paddlelite', 'paddlelite.libs'],
    package_dir=PACKAGE_DIR,
    package_data=PACKAGE_DATA,

--- a/lite/backends/apu/device.h
+++ b/lite/backends/apu/device.h
@@ -14,9 +14,9 @@

 #pragma once

+#include <map>
 #include <memory>
 #include <string>
-#include <unordered_map>
 #include <vector>
 #include "lite/backends/apu/neuron_adapter.h"


--- a/lite/backends/arm/math/conv3x3s1_depthwise_int8.cc
+++ b/lite/backends/arm/math/conv3x3s1_depthwise_int8.cc
@@ -36,7 +36,8 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
                               const float* scale,
                               const float* bias,
                               bool flag_bias,
-                               bool flag_relu,
+                               int flag_act,
+                               float* alpha,
                               int num,
                               int chin,
                               int hin,
@@ -434,7 +435,8 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
                                          chout,
                                          hout,
                                          wout,
-                                          flag_relu,
+                                          flag_act,
+                                          alpha,
                                          bias_local,
                                          flag_bias,
                                          ptr_write,
@@ -450,7 +452,8 @@ template void conv_depthwise_3x3s1_int8<int8_t>(int8_t* dout,
                                                const float* scale,
                                                const float* bias,
                                                bool flag_bias,
-                                                bool flag_relu,
+                                                int flag_act,
+                                                float* alpha,
                                                int num,
                                                int chin,
                                                int hin,
@@ -467,7 +470,8 @@ template void conv_depthwise_3x3s1_int8<float>(float* dout,
                                               const float* scale,
                                               const float* bias,
                                               bool flag_bias,
-                                               bool flag_relu,
+                                               int flag_act,
+                                               float* alpha,
                                               int num,
                                               int chin,
                                               int hin,

--- a/lite/backends/arm/math/conv3x3s1_direct_int8.cc
+++ b/lite/backends/arm/math/conv3x3s1_direct_int8.cc
@@ -42,8 +42,30 @@ void conv_3x3s1_direct_int8(const int8_t* din,
                            Context<TARGET(kARM)>* ctx,
                            const float* scale) {
  auto paddings = *param.paddings;
-  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias;
+  auto act_param = param.activation_param;
+  auto act_type = act_param.active_type;
+  int flag_act = 0;  // relu: 1, relu6: 2, leaky relu: 3
+  float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+  if (act_param.has_active) {
+    if (act_type == lite_api::ActivationType::kRelu) {
+      flag_act = 1;
+    } else if (act_type == lite_api::ActivationType::kRelu6) {
+      flag_act = 2;
+      float local_alpha = act_param.Relu_clipped_coef;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+      flag_act = 3;
+      float local_alpha = act_param.Leaky_relu_alpha;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    }
+  }
  int pad_h = paddings[0];
  int pad_w = paddings[2];

@@ -442,7 +464,8 @@ void conv_3x3s1_direct_int8(const int8_t* din,
                                   chout,
                                   hout,
                                   wout,
-                                   flag_relu,
+                                   flag_act,
+                                   alpha,
                                   bias_local,
                                   flag_bias,
                                   ptr_write,

--- a/lite/backends/arm/math/conv3x3s2_depthwise_int8.cc
+++ b/lite/backends/arm/math/conv3x3s2_depthwise_int8.cc
@@ -36,7 +36,8 @@ void conv_depthwise_3x3s2_int8(Dtype* dout,
                               const float* scale,
                               const float* bias,
                               bool flag_bias,
-                               bool flag_relu,
+                               int flag_act,
+                               float* alpha,
                               int num,
                               int chin,
                               int hin,
@@ -447,7 +448,8 @@ void conv_depthwise_3x3s2_int8(Dtype* dout,
                                          chout,
                                          hout,
                                          wout,
-                                          flag_relu,
+                                          flag_act,
+                                          alpha,
                                          bias_local,
                                          flag_bias,
                                          ptr_write,
@@ -463,7 +465,8 @@ template void conv_depthwise_3x3s2_int8<int8_t>(int8_t* dout,
                                                const float* scale,
                                                const float* bias,
                                                bool flag_bias,
-                                                bool flag_relu,
+                                                int flag_act,
+                                                float* alpha,
                                                int num,
                                                int chin,
                                                int hin,
@@ -480,7 +483,8 @@ template void conv_depthwise_3x3s2_int8<float>(float* dout,
                                               const float* scale,
                                               const float* bias,
                                               bool flag_bias,
-                                               bool flag_relu,
+                                               int flag_act,
+                                               float* alpha,
                                               int num,
                                               int chin,
                                               int hin,

--- a/lite/backends/arm/math/conv3x3s2_direct_int8.cc
+++ b/lite/backends/arm/math/conv3x3s2_direct_int8.cc
@@ -47,8 +47,30 @@ void conv_3x3s2_direct_int8(const int8_t* din,
  //! prepack input to tmp buffer
  //! write output to tmp buffer
  auto paddings = *param.paddings;
-  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias;
+  auto act_param = param.activation_param;
+  auto act_type = act_param.active_type;
+  int flag_act = 0;  // relu: 1, relu6: 2, leaky relu: 3
+  float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+  if (act_param.has_active) {
+    if (act_type == lite_api::ActivationType::kRelu) {
+      flag_act = 1;
+    } else if (act_type == lite_api::ActivationType::kRelu6) {
+      flag_act = 2;
+      float local_alpha = act_param.Relu_clipped_coef;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+      flag_act = 3;
+      float local_alpha = act_param.Leaky_relu_alpha;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    }
+  }
  int pad_h = paddings[0];
  int pad_w = paddings[2];

@@ -442,7 +464,8 @@ void conv_3x3s2_direct_int8(const int8_t* din,
                                   chout,
                                   hout,
                                   wout,
-                                   flag_relu,
+                                   flag_act,
+                                   alpha,
                                   bias_local,
                                   flag_bias,
                                   ptr_write,
@@ -474,8 +497,30 @@ void conv_3x3s2_direct_int8(const int8_t* din,
  //! prepack input to tmp buffer
  //! write output to tmp buffer
  auto paddings = *param.paddings;
-  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias;
+  auto act_param = param.activation_param;
+  auto act_type = act_param.active_type;
+  int flag_act = 0;  // relu: 1, relu6: 2, leaky relu: 3
+  float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+  if (act_param.has_active) {
+    if (act_type == lite_api::ActivationType::kRelu) {
+      flag_act = 1;
+    } else if (act_type == lite_api::ActivationType::kRelu6) {
+      flag_act = 2;
+      float local_alpha = act_param.Relu_clipped_coef;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+      flag_act = 3;
+      float local_alpha = act_param.Leaky_relu_alpha;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    }
+  }
  int pad_h = paddings[0];
  int pad_w = paddings[2];
  const int threads = ctx->threads();
@@ -698,7 +743,8 @@ void conv_3x3s2_direct_int8(const int8_t* din,
                                   chout,
                                   hout,
                                   wout,
-                                   flag_relu,
+                                   flag_act,
+                                   alpha,
                                   bias_local,
                                   flag_bias,
                                   ptr_write,

--- a/lite/backends/arm/math/conv5x5s1_depthwise_int8.cc
+++ b/lite/backends/arm/math/conv5x5s1_depthwise_int8.cc
@@ -36,7 +36,8 @@ void conv_depthwise_5x5s1_int8(Dtype* dout,
                               const float* scale,
                               const float* bias,
                               bool flag_bias,
-                               bool flag_relu,
+                               int flag_act,
+                               float* alpha,
                               int num,
                               int chin,
                               int hin,
@@ -726,7 +727,8 @@ void conv_depthwise_5x5s1_int8(Dtype* dout,
                                          chout,
                                          hout,
                                          wout,
-                                          flag_relu,
+                                          flag_act,
+                                          alpha,
                                          bias_local,
                                          flag_bias,
                                          ptr_write,
@@ -742,7 +744,8 @@ template void conv_depthwise_5x5s1_int8<int8_t>(int8_t* dout,
                                                const float* scale,
                                                const float* bias,
                                                bool flag_bias,
-                                                bool flag_relu,
+                                                int flag_act,
+                                                float* alpha,
                                                int num,
                                                int chin,
                                                int hin,
@@ -759,7 +762,8 @@ template void conv_depthwise_5x5s1_int8<float>(float* dout,
                                               const float* scale,
                                               const float* bias,
                                               bool flag_bias,
-                                               bool flag_relu,
+                                               int flag_act,
+                                               float* alpha,
                                               int num,
                                               int chin,
                                               int hin,

--- a/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc
+++ b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc
@@ -36,7 +36,8 @@ void conv_depthwise_5x5s2_int8(Dtype* dout,
                               const float* scale,
                               const float* bias,
                               bool flag_bias,
-                               bool flag_relu,
+                               int flag_act,
+                               float* alpha,
                               int num,
                               int chin,
                               int hin,
@@ -746,7 +747,8 @@ void conv_depthwise_5x5s2_int8(Dtype* dout,
                                          chout,
                                          hout,
                                          wout,
-                                          flag_relu,
+                                          flag_act,
+                                          alpha,
                                          bias_local,
                                          flag_bias,
                                          ptr_write,
@@ -762,7 +764,8 @@ template void conv_depthwise_5x5s2_int8<int8_t>(int8_t* dout,
                                                const float* scale,
                                                const float* bias,
                                                bool flag_bias,
-                                                bool flag_relu,
+                                                int flag_act,
+                                                float* alpha,
                                                int num,
                                                int chin,
                                                int hin,
@@ -779,7 +782,8 @@ template void conv_depthwise_5x5s2_int8<float>(float* dout,
                                               const float* scale,
                                               const float* bias,
                                               bool flag_bias,
-                                               bool flag_relu,
+                                               int flag_act,
+                                               float* alpha,
                                               int num,
                                               int chin,
                                               int hin,

--- a/lite/backends/arm/math/conv_block_utils.h
+++ b/lite/backends/arm/math/conv_block_utils.h
--- a/lite/backends/arm/math/conv_depthwise.h
+++ b/lite/backends/arm/math/conv_depthwise.h
@@ -94,7 +94,8 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
                               const float* scale,
                               const float* bias,
                               bool flag_bias,
-                               bool flag_relu,
+                               int flag_act,
+                               float* alpha,
                               int num,
                               int chin,
                               int hin,
@@ -112,7 +113,8 @@ void conv_depthwise_3x3s2_int8(Dtype* dout,
                               const float* scale,
                               const float* bias,
                               bool flag_bias,
-                               bool flag_relu,
+                               int flag_act,
+                               float* alpha,
                               int num,
                               int chin,
                               int hin,
@@ -178,7 +180,8 @@ void conv_depthwise_5x5s1_int8(Dtype* dout,
                               const float* scale,
                               const float* bias,
                               bool flag_bias,
-                               bool flag_relu,
+                               int flag_act,
+                               float* alpha,
                               int num,
                               int chin,
                               int hin,
@@ -196,7 +199,8 @@ void conv_depthwise_5x5s2_int8(Dtype* dout,
                               const float* scale,
                               const float* bias,
                               bool flag_bias,
-                               bool flag_relu,
+                               int flag_act,
+                               float* alpha,
                               int num,
                               int chin,
                               int hin,

--- a/lite/backends/arm/math/conv_impl.cc
+++ b/lite/backends/arm/math/conv_impl.cc
@@ -264,6 +264,7 @@ void conv1x1s1_gemm_int8(const int8_t* i_data,
  }
  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias != nullptr;
+  auto act_param = param.activation_param;
  //! use gemv when the output channel size = 1
  for (int b = 0; b < num; ++b) {
    // dC
@@ -283,8 +284,11 @@ void conv1x1s1_gemm_int8(const int8_t* i_data,
                  scale_group,
                  flag_bias,
                  bias_group,
-                  flag_relu,
-                  ctx);
+                  act_param.has_active,
+                  act_param.active_type,
+                  ctx,
+                  act_param.Relu_clipped_coef,
+                  act_param.Leaky_relu_alpha);
      } else {
        gemm_prepack_int8(weights_group,
                          din_group,
@@ -294,9 +298,9 @@ void conv1x1s1_gemm_int8(const int8_t* i_data,
                          n,
                          k,
                          flag_bias,
-                          flag_relu,
                          false,
                          scale_group,
+                          act_param,
                          ctx);
      }
    }
@@ -474,6 +478,8 @@ void conv_im2col_gemm_int8(const int8_t* i_data,
  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias != nullptr;

+  auto act_param = param.activation_param;
+
  int hblock = get_hblock_int8(ctx);
  int k_roundup = ROUNDUP(k, KBLOCK_INT8);
  int m_roundup = ROUNDUP(m, hblock);
@@ -523,8 +529,11 @@ void conv_im2col_gemm_int8(const int8_t* i_data,
                  scale_group,
                  flag_bias,
                  bias_group,
-                  flag_relu,
-                  ctx);
+                  act_param.has_active,
+                  act_param.active_type,
+                  ctx,
+                  act_param.Relu_clipped_coef,
+                  act_param.Leaky_relu_alpha);
      } else {
        gemm_prepack_int8(weights_group,
                          dB,
@@ -534,9 +543,9 @@ void conv_im2col_gemm_int8(const int8_t* i_data,
                          n,
                          k,
                          flag_bias,
-                          flag_relu,
                          false,
                          scale_group,
+                          act_param,
                          ctx);
      }
    }
@@ -781,8 +790,30 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
  int pad_h = paddings[0];
  int pad_w = paddings[2];
  int stride = param.strides[1];
-  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias != nullptr;
+  auto act_param = param.activation_param;
+  auto act_type = act_param.active_type;
+  int flag_act = 0;  // relu: 1, relu6: 2, leaky relu: 3
+  float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+  if (act_param.has_active) {
+    if (act_type == lite_api::ActivationType::kRelu) {
+      flag_act = 1;
+    } else if (act_type == lite_api::ActivationType::kRelu6) {
+      flag_act = 2;
+      float local_alpha = act_param.Relu_clipped_coef;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+      flag_act = 3;
+      float local_alpha = act_param.Leaky_relu_alpha;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    }
+  }
  if (stride == 1) {
    conv_depthwise_3x3s1_int8(reinterpret_cast<float*>(dout),
                              reinterpret_cast<const int8_t*>(din),
@@ -790,7 +821,8 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
                              scale,
                              bias,
                              flag_bias,
-                              flag_relu,
+                              flag_act,
+                              alpha,
                              num,
                              ch_in,
                              h_in,
@@ -807,7 +839,8 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
                              scale,
                              bias,
                              flag_bias,
-                              flag_relu,
+                              flag_act,
+                              alpha,
                              num,
                              ch_in,
                              h_in,
@@ -840,8 +873,30 @@ void conv_depthwise_3x3_int8_int8(const void* din,
  int pad_h = paddings[0];
  int pad_w = paddings[2];
  int stride = param.strides[1];
-  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias != nullptr;
+  auto act_param = param.activation_param;
+  auto act_type = act_param.active_type;
+  int flag_act = 0;  // relu: 1, relu6: 2, leaky relu: 3
+  float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+  if (act_param.has_active) {
+    if (act_type == lite_api::ActivationType::kRelu) {
+      flag_act = 1;
+    } else if (act_type == lite_api::ActivationType::kRelu6) {
+      flag_act = 2;
+      float local_alpha = act_param.Relu_clipped_coef;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+      flag_act = 3;
+      float local_alpha = act_param.Leaky_relu_alpha;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    }
+  }
  if (stride == 1) {
    conv_depthwise_3x3s1_int8(reinterpret_cast<int8_t*>(dout),
                              reinterpret_cast<const int8_t*>(din),
@@ -849,7 +904,8 @@ void conv_depthwise_3x3_int8_int8(const void* din,
                              scale,
                              bias,
                              flag_bias,
-                              flag_relu,
+                              flag_act,
+                              alpha,
                              num,
                              ch_in,
                              h_in,
@@ -866,7 +922,8 @@ void conv_depthwise_3x3_int8_int8(const void* din,
                              scale,
                              bias,
                              flag_bias,
-                              flag_relu,
+                              flag_act,
+                              alpha,
                              num,
                              ch_in,
                              h_in,
@@ -899,8 +956,30 @@ void conv_depthwise_5x5_int8_fp32(const void* din,
  int pad_h = paddings[0];
  int pad_w = paddings[2];
  int stride = param.strides[1];
-  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias != nullptr;
+  auto act_param = param.activation_param;
+  auto act_type = act_param.active_type;
+  int flag_act = 0;  // relu: 1, relu6: 2, leaky relu: 3
+  float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+  if (act_param.has_active) {
+    if (act_type == lite_api::ActivationType::kRelu) {
+      flag_act = 1;
+    } else if (act_type == lite_api::ActivationType::kRelu6) {
+      flag_act = 2;
+      float local_alpha = act_param.Relu_clipped_coef;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+      flag_act = 3;
+      float local_alpha = act_param.Leaky_relu_alpha;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    }
+  }
  if (stride == 1) {
    conv_depthwise_5x5s1_int8(reinterpret_cast<float*>(dout),
                              reinterpret_cast<const int8_t*>(din),
@@ -908,7 +987,8 @@ void conv_depthwise_5x5_int8_fp32(const void* din,
                              scale,
                              bias,
                              flag_bias,
-                              flag_relu,
+                              flag_act,
+                              alpha,
                              num,
                              ch_in,
                              h_in,
@@ -925,7 +1005,8 @@ void conv_depthwise_5x5_int8_fp32(const void* din,
                              scale,
                              bias,
                              flag_bias,
-                              flag_relu,
+                              flag_act,
+                              alpha,
                              num,
                              ch_in,
                              h_in,
@@ -958,8 +1039,30 @@ void conv_depthwise_5x5_int8_int8(const void* din,
  int pad_h = paddings[0];
  int pad_w = paddings[2];
  int stride = param.strides[1];
-  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias != nullptr;
+  auto act_param = param.activation_param;
+  auto act_type = act_param.active_type;
+  int flag_act = 0;  // relu: 1, relu6: 2, leaky relu: 3
+  float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+  if (act_param.has_active) {
+    if (act_type == lite_api::ActivationType::kRelu) {
+      flag_act = 1;
+    } else if (act_type == lite_api::ActivationType::kRelu6) {
+      flag_act = 2;
+      float local_alpha = act_param.Relu_clipped_coef;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+      flag_act = 3;
+      float local_alpha = act_param.Leaky_relu_alpha;
+      alpha[0] = local_alpha;
+      alpha[1] = local_alpha;
+      alpha[2] = local_alpha;
+      alpha[3] = local_alpha;
+    }
+  }
  if (stride == 1) {
    conv_depthwise_5x5s1_int8(reinterpret_cast<int8_t*>(dout),
                              reinterpret_cast<const int8_t*>(din),
@@ -967,7 +1070,8 @@ void conv_depthwise_5x5_int8_int8(const void* din,
                              scale,
                              bias,
                              flag_bias,
-                              flag_relu,
+                              flag_act,
+                              alpha,
                              num,
                              ch_in,
                              h_in,
@@ -984,7 +1088,8 @@ void conv_depthwise_5x5_int8_int8(const void* din,
                              scale,
                              bias,
                              flag_bias,
-                              flag_relu,
+                              flag_act,
+                              alpha,
                              num,
                              ch_in,
                              h_in,

--- a/lite/backends/arm/math/gemm_prepacked_int8.cc
+++ b/lite/backends/arm/math/gemm_prepacked_int8.cc
--- a/lite/backends/arm/math/gemm_prepacked_int8.h
+++ b/lite/backends/arm/math/gemm_prepacked_int8.h
@@ -16,6 +16,7 @@
 #include <cmath>
 #include "lite/core/context.h"
 #include "lite/core/tensor.h"
+#include "lite/operators/op_params.h"

 namespace paddle {
 namespace lite {
@@ -80,9 +81,9 @@ void gemm_prepack_int8(const int8_t* A_packed,
                       int N,
                       int K,
                       bool is_bias,
-                       bool is_relu,
                       bool is_transB,
                       const float* scale,
+                       const operators::ActivationParam act_param,
                       ARMContext* ctx);

 #define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b))

--- a/lite/backends/arm/math/gemm_s8.cc
+++ b/lite/backends/arm/math/gemm_s8.cc
@@ -30,8 +30,8 @@ void gemm_s8(bool is_transA,
             Dtype* C,
             const float* bias,
             bool is_bias,
-             bool is_relu,
             const float* scale,
+             const operators::ActivationParam act_param,
             ARMContext* ctx) {
  int hblock = get_hblock_int8(ctx);
  int m_roundup = hblock * ((M + hblock - 1) / hblock);
@@ -42,7 +42,7 @@ void gemm_s8(bool is_transA,
  prepackA_int8(packed_A, A, lda, 0, M, 0, K, is_transA, ctx);

  gemm_prepack_int8(
-      packed_A, B, bias, C, M, N, K, is_bias, is_relu, is_transB, scale, ctx);
+      packed_A, B, bias, C, M, N, K, is_bias, is_transB, scale, act_param, ctx);
  TargetFree(TargetType::kARM, packed_A);
 }

@@ -56,8 +56,8 @@ template void gemm_s8<float>(bool is_transA,
                             float* C,
                             const float* bias,
                             bool is_bias,
-                             bool is_relu,
                             const float* scale,
+                             const operators::ActivationParam act_param,
                             ARMContext* ctx);

 template void gemm_s8<int8_t>(bool is_transA,
@@ -70,8 +70,8 @@ template void gemm_s8<int8_t>(bool is_transA,
                              int8_t* C,
                              const float* bias,
                              bool is_bias,
-                              bool is_relu,
                              const float* scale,
+                              const operators::ActivationParam act_param,
                              ARMContext* ctx);

 }  // namespace math

--- a/lite/backends/arm/math/gemm_s8.h
+++ b/lite/backends/arm/math/gemm_s8.h
@@ -34,8 +34,8 @@ void gemm_s8(bool is_transA,
             Dtype* C,
             const float* bias,
             bool is_bias,
-             bool is_relu,
             const float* scale,
+             const operators::ActivationParam act_param,
             ARMContext* ctx);

 }  // namespace math

--- a/lite/backends/arm/math/gemv_arm_int8.cc
+++ b/lite/backends/arm/math/gemv_arm_int8.cc
@@ -27,7 +27,10 @@ inline void write_gemv_out(const int* in,
                           const float* scale,
                           const float* bias,
                           int size,
-                           bool is_relu);
+                           bool flag_act,
+                           lite_api::ActivationType act,
+                           float six,
+                           float alpha);

 template <>
 inline void write_gemv_out(const int* in,
@@ -35,7 +38,10 @@ inline void write_gemv_out(const int* in,
                           const float* scale,
                           const float* bias,
                           int size,
-                           bool is_relu) {
+                           bool flag_act,
+                           lite_api::ActivationType act,
+                           float six,
+                           float alpha) {
  int i = 0;
  float32x4_t vzero = vdupq_n_f32(0.f);
  for (; i < size - 7; i += 8) {
@@ -49,9 +55,25 @@ inline void write_gemv_out(const int* in,
    float32x4_t vinf1 = vcvtq_f32_s32(vin1);
    vout0 = vmlaq_f32(vout0, vinf0, vscale0);
    vout1 = vmlaq_f32(vout1, vinf1, vscale1);
-    if (is_relu) {
+    if (flag_act) {
+      if (act == lite_api::ActivationType::kRelu) {
        vout0 = vmaxq_f32(vout0, vzero);
        vout1 = vmaxq_f32(vout1, vzero);
+      } else if (act == lite_api::ActivationType::kRelu6) {
+        float32x4_t vsix = vdupq_n_f32(six);
+        vout0 = vmaxq_f32(vout0, vzero);
+        vout1 = vmaxq_f32(vout1, vzero);
+        vout0 = vminq_f32(vout0, vsix);
+        vout1 = vminq_f32(vout1, vsix);
+      } else if (act == lite_api::ActivationType::kLeakyRelu) {
+        float32x4_t valpha = vdupq_n_f32(alpha);
+        uint32x4_t maska = vcgeq_f32(vout0, vzero);
+        uint32x4_t maskb = vcgeq_f32(vout1, vzero);
+        float32x4_t suma = vmulq_f32(vout0, valpha);
+        float32x4_t sumb = vmulq_f32(vout1, valpha);
+        vout0 = vbslq_f32(maska, vout0, suma);
+        vout1 = vbslq_f32(maskb, vout1, sumb);
+      }
    }
    vst1q_f32(out, vout0);
    vst1q_f32(out + 4, vout1);
@@ -63,7 +85,15 @@ inline void write_gemv_out(const int* in,
  for (; i < size; ++i) {
    out[0] = *(in++) * *(scale)++;
    out[0] += bias ? *(bias++) : 0.f;
-    out[0] = is_relu ? (out[0] > 0.f ? out[0] : 0.f) : out[0];
+    if (flag_act) {
+      if (act == lite_api::ActivationType::kRelu) {
+        out[0] = out[0] > 0.f ? out[0] : 0.f;
+      } else if (act == lite_api::ActivationType::kRelu6) {
+        out[0] = out[0] > 0.f ? (out[0] > six ? six : out[0]) : 0.f;
+      } else if (act == lite_api::ActivationType::kLeakyRelu) {
+        out[0] = out[0] > 0.f ? out[0] : out[0] * alpha;
+      }
+    }
    out++;
  }
 }
@@ -74,24 +104,40 @@ inline void write_gemv_out(const int* in,
                           const float* scale,
                           const float* bias,
                           int size,
-                           bool flag_relu) {
+                           bool flag_act,
+                           lite_api::ActivationType act,
+                           float six,
+                           float alpha) {
  if (bias) {
    for (int i = 0; i < size; ++i) {
-      out[0] =
-          saturate_cast<signed char>(roundf(*(in++) * *(scale++) + *(bias++)));
-      out[0] = out[0] < -127 ? -127 : out[0];  // -127 - 127
-      if (flag_relu) {
-        out[0] = out[0] > 0 ? out[0] : 0;
+      float tmp = *(in++) * *(scale++) + *(bias++);
+      if (flag_act) {
+        if (act == lite_api::ActivationType::kRelu) {
+          tmp = tmp > 0.f ? tmp : 0.f;
+        } else if (act == lite_api::ActivationType::kRelu6) {
+          tmp = tmp > 0.f ? (tmp > six ? six : tmp) : 0.f;
+        } else if (act == lite_api::ActivationType::kLeakyRelu) {
+          tmp = tmp > 0.f ? tmp : (tmp * alpha);
        }
+      }
+      out[0] = saturate_cast<signed char>(roundf(tmp));
+      out[0] = out[0] < -127 ? -127 : out[0];  // -127 - 127
      out++;
    }
  } else {
    for (int i = 0; i < size; ++i) {
-      out[0] = saturate_cast<signed char>(roundf(*(in++) * *(scale++)));
-      out[0] = out[0] < -127 ? -127 : out[0];  // -127 - 127
-      if (flag_relu) {
-        out[0] = out[0] > 0 ? out[0] : 0;
+      float tmp = *(in++) * *(scale++);
+      if (flag_act) {
+        if (act == lite_api::ActivationType::kRelu) {
+          tmp = tmp > 0.f ? tmp : 0.f;
+        } else if (act == lite_api::ActivationType::kRelu6) {
+          tmp = tmp > 0.f ? (tmp > six ? six : tmp) : 0.f;
+        } else if (act == lite_api::ActivationType::kLeakyRelu) {
+          tmp = tmp > 0.f ? tmp : tmp * alpha;
        }
+      }
+      out[0] = saturate_cast<signed char>(roundf(tmp));
+      out[0] = out[0] < -127 ? -127 : out[0];  // -127 - 127
      out++;
    }
  }
@@ -107,7 +153,10 @@ bool gemv_int8_oth(const int8_t* A,
                   const float* scale,
                   bool is_bias,
                   const float* bias,
-                   bool is_relu) {
+                   bool flag_act,
+                   lite_api::ActivationType act,
+                   float six,
+                   float alpha) {
  if (transA) {
    LOG(ERROR) << "ERROR: sgemv, transA is not supported now";
    return false;
@@ -260,7 +309,8 @@ bool gemv_int8_oth(const int8_t* A,
      ptr_out[7] += ptr_in[i] * ptr_w7[i];
    }

-    write_gemv_out(ptr_out, out_ptr, scale_ptr, bias_ptr, 8, is_relu);
+    write_gemv_out(
+        ptr_out, out_ptr, scale_ptr, bias_ptr, 8, flag_act, act, six, alpha);
  }

 //! deal with remains
@@ -304,7 +354,8 @@ bool gemv_int8_oth(const int8_t* A,
    for (int i = 0; i < tail; ++i) {
      ptr_out[0] += ptr_in[i] * ptr_w0[i];
    }
-    write_gemv_out(ptr_out, out_ptr, scale_ptr, bias_ptr, 1, is_relu);
+    write_gemv_out(
+        ptr_out, out_ptr, scale_ptr, bias_ptr, 1, flag_act, act, six, alpha);
  }
 #else  //  __aarch64__
  int out_cnt = M >> 2;
@@ -398,7 +449,8 @@ bool gemv_int8_oth(const int8_t* A,
      ptr_out[2] += ptr_in[i] * ptr_w2[i];
      ptr_out[3] += ptr_in[i] * ptr_w3[i];
    }
-    write_gemv_out(ptr_out, out_ptr, scale_ptr, bias_ptr, 4, is_relu);
+    write_gemv_out(
+        ptr_out, out_ptr, scale_ptr, bias_ptr, 4, flag_act, act, six, alpha);
  }
 //! deal with remains
 #pragma omp parallel for
@@ -439,7 +491,8 @@ bool gemv_int8_oth(const int8_t* A,
    for (int i = 0; i < tail; ++i) {
      ptr_out[0] += ptr_in[i] * ptr_w0[i];
    }
-    write_gemv_out(ptr_out, out_ptr, scale_ptr, bias_ptr, 1, is_relu);
+    write_gemv_out(
+        ptr_out, out_ptr, scale_ptr, bias_ptr, 1, flag_act, act, six, alpha);
  }
 #endif  //  __aarch64__
  return true;
@@ -456,7 +509,10 @@ bool gemv_int8_sdot(const int8_t* A,
                    const float* scale,
                    bool is_bias,
                    const float* bias,
-                    bool is_relu) {
+                    bool flag_act,
+                    lite_api::ActivationType act,
+                    float six,
+                    float alpha) {
  if (transA) {
    LOG(ERROR) << "ERROR: sgemv, transA is not supported now";
    return false;
@@ -594,7 +650,8 @@ bool gemv_int8_sdot(const int8_t* A,
      ptr_out[6] += ptr_in[i] * ptr_w6[i];
      ptr_out[7] += ptr_in[i] * ptr_w7[i];
    }
-    write_gemv_out(ptr_out, out_ptr, scale_ptr, bias_ptr, 8, is_relu);
+    write_gemv_out(
+        ptr_out, out_ptr, scale_ptr, bias_ptr, 8, flag_act, act, six, alpha);
  }
 //! deal with remains
 #pragma omp parallel for
@@ -634,7 +691,8 @@ bool gemv_int8_sdot(const int8_t* A,
    for (int i = 0; i < tail; ++i) {
      ptr_out[0] += ptr_in[i] * ptr_w0[i];
    }
-    write_gemv_out(ptr_out, out_ptr, scale_ptr, bias_ptr, 1, is_relu);
+    write_gemv_out(
+        ptr_out, out_ptr, scale_ptr, bias_ptr, 1, flag_act, act, six, alpha);
  }
  return true;
 }
@@ -650,19 +708,22 @@ bool gemv_int8<float>(const int8_t* A,
                      const float* scale,
                      bool is_bias,
                      const float* bias,
-                      bool is_relu,
-                      const ARMContext* ctx) {
+                      bool flag_act,
+                      lite_api::ActivationType act,
+                      const ARMContext* ctx,
+                      float six,
+                      float alpha) {
 #if defined(__aarch64__) && defined(WITH_ARM_DOTPROD)
  if (ctx->has_dot()) {
    return gemv_int8_sdot<float>(
-        A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
+        A, x, y, transA, M, N, scale, is_bias, bias, flag_act, act, six, alpha);
  } else {
    return gemv_int8_oth<float>(
-        A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
+        A, x, y, transA, M, N, scale, is_bias, bias, flag_act, act, six, alpha);
  }
 #else
  return gemv_int8_oth<float>(
-      A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
+      A, x, y, transA, M, N, scale, is_bias, bias, flag_act, act, six, alpha);
 #endif
 }

@@ -676,19 +737,22 @@ bool gemv_int8<int8_t>(const int8_t* A,
                       const float* scale,
                       bool is_bias,
                       const float* bias,
-                       bool is_relu,
-                       const ARMContext* ctx) {
+                       bool flag_act,
+                       lite_api::ActivationType act,
+                       const ARMContext* ctx,
+                       float six,
+                       float alpha) {
 #if defined(__aarch64__) && defined(WITH_ARM_DOTPROD)
  if (ctx->has_dot()) {
    return gemv_int8_sdot<int8_t>(
-        A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
+        A, x, y, transA, M, N, scale, is_bias, bias, flag_act, act, six, alpha);
  } else {
    return gemv_int8_oth<int8_t>(
-        A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
+        A, x, y, transA, M, N, scale, is_bias, bias, flag_act, act, six, alpha);
  }
 #else
  return gemv_int8_oth<int8_t>(
-      A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
+      A, x, y, transA, M, N, scale, is_bias, bias, flag_act, act, six, alpha);
 #endif
 }


--- a/lite/backends/arm/math/gemv_arm_int8.h
+++ b/lite/backends/arm/math/gemv_arm_int8.h
@@ -32,8 +32,11 @@ bool gemv_int8(const int8_t* A,
               const float* scale,
               bool is_bias,
               const float* bias,
-               bool is_relu,
-               const ARMContext* ctx);
+               bool flag_act,
+               lite_api::ActivationType act,
+               const ARMContext* ctx,
+               float six = 6.f,
+               float alpha = 1.f);

 }  // namespace math
 }  // namespace arm

--- a/lite/backends/arm/math/pooling.cc
+++ b/lite/backends/arm/math/pooling.cc
--- a/lite/backends/arm/math/pooling.h
+++ b/lite/backends/arm/math/pooling.h
--- a/lite/backends/arm/math/sequence2batch.h
+++ b/lite/backends/arm/math/sequence2batch.h
@@ -109,9 +109,9 @@ class LoDTensor2BatchFunctor {
      seq_info.emplace_back(lod[seq_id], length, seq_id);
    }

-    std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
-      return a.length > b.length;
-    });
+    std::stable_sort(seq_info.begin(),
+                     seq_info.end(),
+                     [](SeqInfo a, SeqInfo b) { return a.length > b.length; });

    // Calculate the start position of each batch.
    // example:  sequences = {s0, s1, s2}

--- a/lite/backends/cuda/cuda_utils.h
+++ b/lite/backends/cuda/cuda_utils.h
@@ -17,6 +17,7 @@
 #include <cublas_api.h>
 #include <cublas_v2.h>
 #include <cuda.h>
+#include <cuda_fp16.h>
 #include <cudnn.h>
 #include "lite/utils/cp_logging.h"

@@ -64,6 +65,9 @@ inline int CUDA_GET_BLOCKS(const int N) {
 inline int CUDA_GET_BLOCKS(const int N, const int base) {
  return (N + base - 1) / base;
 }
+#define CUDA_KERNEL_LOOP(i, n)                                 \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)

 namespace paddle {
 namespace lite {

--- a/lite/backends/cuda/math/CMakeLists.txt
+++ b/lite/backends/cuda/math/CMakeLists.txt
@@ -8,8 +8,7 @@ nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps})
 nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps})
 nv_library(cuda_type_trans SRCS type_trans.cu DEPS ${cuda_static_deps}) 
 nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
-nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale
-cuda_type_trans ${cuda_static_deps})
+nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps})
 nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
 nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
 nv_library(cuda_gemm SRCS gemm.cc  DEPS ${cuda_static_deps})

--- a/lite/backends/cuda/math/activation.cu
+++ b/lite/backends/cuda/math/activation.cu
--- a/lite/backends/cuda/math/activation.h
+++ b/lite/backends/cuda/math/activation.h
--- a/lite/backends/cuda/math/batched_gemm.cc
+++ b/lite/backends/cuda/math/batched_gemm.cc
--- a/lite/backends/cuda/math/conv_op_cache_cudnn.h
+++ b/lite/backends/cuda/math/conv_op_cache_cudnn.h
--- a/lite/backends/cuda/math/cudnn_conv.cc
+++ b/lite/backends/cuda/math/cudnn_conv.cc
--- a/lite/backends/cuda/math/cudnn_conv.h
+++ b/lite/backends/cuda/math/cudnn_conv.h
--- a/lite/backends/cuda/math/gemm.cc
+++ b/lite/backends/cuda/math/gemm.cc
--- a/lite/backends/cuda/math/type_trans.cu
+++ b/lite/backends/cuda/math/type_trans.cu
--- a/lite/backends/cuda/math/type_trans.h
+++ b/lite/backends/cuda/math/type_trans.h
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
--- a/lite/backends/fpga/KD/pes/pooling_pe.hpp
+++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp
--- a/lite/backends/npu/device.cc
+++ b/lite/backends/npu/device.cc
--- a/lite/backends/npu/device.h
+++ b/lite/backends/npu/device.h
--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
--- a/lite/backends/opencl/cl_kernel/image/expand_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/expand_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/pixel_shuffle_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/pixel_shuffle_kernel.cl
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
--- a/lite/backends/opencl/cl_runtime.h
+++ b/lite/backends/opencl/cl_runtime.h
--- a/lite/backends/opencl/cl_utility.h
+++ b/lite/backends/opencl/cl_utility.h
--- a/lite/backends/opencl/cl_wrapper.cc
+++ b/lite/backends/opencl/cl_wrapper.cc
--- a/lite/backends/rknpu/device.h
+++ b/lite/backends/rknpu/device.h
--- a/lite/backends/x86/CMakeLists.txt
+++ b/lite/backends/x86/CMakeLists.txt
--- a/lite/backends/x86/cpu_info.cc
+++ b/lite/backends/x86/cpu_info.cc
--- a/lite/backends/x86/dynamic_loader.cc
+++ b/lite/backends/x86/dynamic_loader.cc
--- a/lite/backends/x86/jit/gen/act.h
+++ b/lite/backends/x86/jit/gen/act.h
--- a/lite/backends/x86/jit/gen/blas.h
+++ b/lite/backends/x86/jit/gen/blas.h
--- a/lite/backends/x86/jit/gen/embseqpool.h
+++ b/lite/backends/x86/jit/gen/embseqpool.h
--- a/lite/backends/x86/jit/gen/gru.h
+++ b/lite/backends/x86/jit/gen/gru.h
--- a/lite/backends/x86/jit/gen/hopv.h
+++ b/lite/backends/x86/jit/gen/hopv.h
--- a/lite/backends/x86/jit/gen/lstm.h
+++ b/lite/backends/x86/jit/gen/lstm.h
--- a/lite/backends/x86/jit/gen/matmul.h
+++ b/lite/backends/x86/jit/gen/matmul.h
--- a/lite/backends/x86/jit/gen/seqpool.h
+++ b/lite/backends/x86/jit/gen/seqpool.h
--- a/lite/backends/x86/jit/gen/sgd.h
+++ b/lite/backends/x86/jit/gen/sgd.h
--- a/lite/backends/x86/jit/gen/vbroadcast.h
+++ b/lite/backends/x86/jit/gen/vbroadcast.h
--- a/lite/backends/x86/jit/helper.h
+++ b/lite/backends/x86/jit/helper.h
--- a/lite/backends/x86/jit/kernel_pool.cc
+++ b/lite/backends/x86/jit/kernel_pool.cc
--- a/lite/backends/x86/jit/refer/refer.h
+++ b/lite/backends/x86/jit/refer/refer.h
--- a/lite/backends/x86/math/sample_prob.h
+++ b/lite/backends/x86/math/sample_prob.h
--- a/lite/backends/x86/math/selected_rows_functor.cc
+++ b/lite/backends/x86/math/selected_rows_functor.cc
--- a/lite/backends/x86/math/sequence2batch.h
+++ b/lite/backends/x86/math/sequence2batch.h
--- a/lite/backends/x86/math/tree2col.cc
+++ b/lite/backends/x86/math/tree2col.cc
--- a/lite/backends/x86/math/tree2col.h
+++ b/lite/backends/x86/math/tree2col.h
--- a/lite/backends/x86/port.h
+++ b/lite/backends/x86/port.h
--- a/lite/core/arena/framework.h
+++ b/lite/core/arena/framework.h
--- a/lite/core/context.cc
+++ b/lite/core/context.cc
--- a/lite/core/context.h
+++ b/lite/core/context.h
--- a/lite/core/exported_symbols.lds
+++ b/lite/core/exported_symbols.lds
--- a/lite/core/kernel.h
+++ b/lite/core/kernel.h
--- a/lite/core/lite.map
+++ b/lite/core/lite.map
--- a/lite/core/mir/dot.h
+++ b/lite/core/mir/dot.h
--- a/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc
+++ b/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc
--- a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc
+++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc
--- a/lite/core/mir/fusion/conv_bn_fuser.cc
+++ b/lite/core/mir/fusion/conv_bn_fuser.cc
--- a/lite/core/mir/fusion/conv_elementwise_fuser.cc
+++ b/lite/core/mir/fusion/conv_elementwise_fuser.cc
--- a/lite/core/mir/fusion/fc_fuse_pass.cc
+++ b/lite/core/mir/fusion/fc_fuse_pass.cc
--- a/lite/core/mir/fusion/quant_dequant_fuse_pass.h
+++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.h
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
--- a/lite/core/mir/memory_optimize_pass.cc
+++ b/lite/core/mir/memory_optimize_pass.cc
--- a/lite/core/mir/memory_optimize_pass.h
+++ b/lite/core/mir/memory_optimize_pass.h
--- a/lite/core/mir/multi_stream_analysis_pass.cc
+++ b/lite/core/mir/multi_stream_analysis_pass.cc
--- a/lite/core/mir/multi_stream_analysis_pass.h
+++ b/lite/core/mir/multi_stream_analysis_pass.h
--- a/lite/core/mir/pass.h
+++ b/lite/core/mir/pass.h
--- a/lite/core/mir/pass_utils.cc
+++ b/lite/core/mir/pass_utils.cc
--- a/lite/core/mir/pattern_matcher.cc
+++ b/lite/core/mir/pattern_matcher.cc
--- a/lite/core/mir/pattern_matcher.h
+++ b/lite/core/mir/pattern_matcher.h
--- a/lite/core/mir/pattern_matcher_high_api.cc
+++ b/lite/core/mir/pattern_matcher_high_api.cc
--- a/lite/core/mir/pattern_matcher_high_api.h
+++ b/lite/core/mir/pattern_matcher_high_api.h
--- a/lite/core/mir/quantized_op_attributes_inference_pass.cc
+++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc
--- a/lite/core/mir/quantized_op_attributes_inference_pass.h
+++ b/lite/core/mir/quantized_op_attributes_inference_pass.h
--- a/lite/core/mir/ssa_graph.cc
+++ b/lite/core/mir/ssa_graph.cc
--- a/lite/core/mir/static_kernel_pick_pass.cc
+++ b/lite/core/mir/static_kernel_pick_pass.cc
--- a/lite/core/mir/static_kernel_pick_pass.h
+++ b/lite/core/mir/static_kernel_pick_pass.h
--- a/lite/core/mir/subgraph/subgraph_detector.cc
+++ b/lite/core/mir/subgraph/subgraph_detector.cc
--- a/lite/core/mir/subgraph/subgraph_detector.h
+++ b/lite/core/mir/subgraph/subgraph_detector.h
--- a/lite/core/mir/subgraph/subgraph_pass.cc
+++ b/lite/core/mir/subgraph/subgraph_pass.cc
--- a/lite/core/mir/subgraph/subgraph_pass_test.cc
+++ b/lite/core/mir/subgraph/subgraph_pass_test.cc
--- a/lite/core/mir/type_precision_cast_pass.cc
+++ b/lite/core/mir/type_precision_cast_pass.cc
--- a/lite/core/mir/type_precision_cast_pass.h
+++ b/lite/core/mir/type_precision_cast_pass.h
--- a/lite/core/mir/type_target_cast_pass.cc
+++ b/lite/core/mir/type_target_cast_pass.cc
--- a/lite/core/mir/type_target_cast_pass.h
+++ b/lite/core/mir/type_target_cast_pass.h
--- a/lite/core/mir/variable_place_inference_pass.h
+++ b/lite/core/mir/variable_place_inference_pass.h
--- a/lite/core/mir/xpu_pattern_matcher.cc
+++ b/lite/core/mir/xpu_pattern_matcher.cc
--- a/lite/core/mir/xpu_pattern_matcher.h
+++ b/lite/core/mir/xpu_pattern_matcher.h
--- a/lite/core/mir/xpu_pattern_matcher_high_api.cc
+++ b/lite/core/mir/xpu_pattern_matcher_high_api.cc
--- a/lite/core/op_lite.h
+++ b/lite/core/op_lite.h
--- a/lite/core/op_registry.cc
+++ b/lite/core/op_registry.cc
--- a/lite/core/op_registry.h
+++ b/lite/core/op_registry.h
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
--- a/lite/core/profile/precision_profiler.h
+++ b/lite/core/profile/precision_profiler.h
--- a/lite/core/profile/profiler.cc
+++ b/lite/core/profile/profiler.cc
--- a/lite/core/profile/profiler.h
+++ b/lite/core/profile/profiler.h
--- a/lite/core/profile/timer.h
+++ b/lite/core/profile/timer.h
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
--- a/lite/core/program.h
+++ b/lite/core/program.h
--- a/lite/core/scope.h
+++ b/lite/core/scope.h
--- a/lite/core/type_system.h
+++ b/lite/core/type_system.h
--- a/lite/demo/cxx/README.md
+++ b/lite/demo/cxx/README.md
--- a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
+++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
--- a/lite/demo/cxx/x86_mobilenetv1_full_demo/build.bat
+++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/build.bat
--- a/lite/demo/cxx/x86_mobilenetv1_full_demo/mobilenet_full_api.cc
+++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/mobilenet_full_api.cc
--- a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
+++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
--- a/lite/demo/cxx/x86_mobilenetv1_light_demo/build.bat
+++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/build.bat
--- a/lite/demo/python/mobilenetv1_full_api.py
+++ b/lite/demo/python/mobilenetv1_full_api.py
--- a/lite/demo/python/mobilenetv1_light_api.py
+++ b/lite/demo/python/mobilenetv1_light_api.py
--- a/lite/fluid/data_type.cc
+++ b/lite/fluid/data_type.cc
--- a/lite/fluid/selected_rows.h
+++ b/lite/fluid/selected_rows.h
--- a/lite/kernels/apu/bridges/graph.h
+++ b/lite/kernels/apu/bridges/graph.h
--- a/lite/kernels/apu/bridges/utility.h
+++ b/lite/kernels/apu/bridges/utility.h
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
--- a/lite/kernels/arm/argmax_compute.cc
+++ b/lite/kernels/arm/argmax_compute.cc
--- a/lite/kernels/arm/argmax_compute.h
+++ b/lite/kernels/arm/argmax_compute.h
--- a/lite/kernels/arm/beam_search_decode_compute.cc
+++ b/lite/kernels/arm/beam_search_decode_compute.cc
--- a/lite/kernels/arm/conv_compute.h
+++ b/lite/kernels/arm/conv_compute.h
--- a/lite/kernels/arm/conv_depthwise.cc
+++ b/lite/kernels/arm/conv_depthwise.cc
--- a/lite/kernels/arm/conv_depthwise.h
+++ b/lite/kernels/arm/conv_depthwise.h
--- a/lite/kernels/arm/conv_direct.cc
+++ b/lite/kernels/arm/conv_direct.cc
--- a/lite/kernels/arm/conv_direct.h
+++ b/lite/kernels/arm/conv_direct.h
--- a/lite/kernels/arm/conv_gemmlike.cc
+++ b/lite/kernels/arm/conv_gemmlike.cc
--- a/lite/kernels/arm/conv_gemmlike.h
+++ b/lite/kernels/arm/conv_gemmlike.h
--- a/lite/kernels/arm/conv_transpose_compute.h
+++ b/lite/kernels/arm/conv_transpose_compute.h
--- a/lite/kernels/arm/conv_winograd.cc
+++ b/lite/kernels/arm/conv_winograd.cc
--- a/lite/kernels/arm/conv_winograd.h
+++ b/lite/kernels/arm/conv_winograd.h
--- a/lite/kernels/arm/fc_compute.cc
+++ b/lite/kernels/arm/fc_compute.cc
--- a/lite/kernels/arm/generate_proposals_compute.cc
+++ b/lite/kernels/arm/generate_proposals_compute.cc
--- a/lite/kernels/arm/pool_compute.cc
+++ b/lite/kernels/arm/pool_compute.cc
--- a/lite/kernels/bm/bridges/graph.h
+++ b/lite/kernels/bm/bridges/graph.h
--- a/lite/kernels/bm/bridges/utility.cc
+++ b/lite/kernels/bm/bridges/utility.cc
--- a/lite/kernels/cuda/CMakeLists.txt
+++ b/lite/kernels/cuda/CMakeLists.txt
--- a/lite/kernels/cuda/calib_compute.cu
+++ b/lite/kernels/cuda/calib_compute.cu
--- a/lite/kernels/cuda/calib_compute.h
+++ b/lite/kernels/cuda/calib_compute.h
--- a/lite/kernels/cuda/conv_compute.cc
+++ b/lite/kernels/cuda/conv_compute.cc
--- a/lite/kernels/cuda/conv_compute.h
+++ b/lite/kernels/cuda/conv_compute.h
--- a/lite/kernels/cuda/conv_compute_test.cc
+++ b/lite/kernels/cuda/conv_compute_test.cc
--- a/lite/kernels/cuda/feed_compute.cc
+++ b/lite/kernels/cuda/feed_compute.cc
--- a/lite/kernels/cuda/search_grnn_compute.cu
+++ b/lite/kernels/cuda/search_grnn_compute.cu
--- a/lite/kernels/cuda/var_conv_2d_compute.cu
+++ b/lite/kernels/cuda/var_conv_2d_compute.cu
--- a/lite/kernels/cuda/var_conv_2d_compute.h
+++ b/lite/kernels/cuda/var_conv_2d_compute.h
--- a/lite/kernels/cuda/var_conv_2d_compute_test.cc
+++ b/lite/kernels/cuda/var_conv_2d_compute_test.cc
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
--- a/lite/kernels/arm/unsqueeze_compute.cc
+++ b/lite/kernels/arm/unsqueeze_compute.cc
--- a/lite/kernels/arm/unsqueeze_compute.h
+++ b/lite/kernels/arm/unsqueeze_compute.h
--- a/lite/kernels/mlu/bridges/graph.h
+++ b/lite/kernels/mlu/bridges/graph.h
--- a/lite/kernels/npu/bridges/dropout_op.cc
+++ b/lite/kernels/npu/bridges/dropout_op.cc
--- a/lite/kernels/npu/bridges/engine.h
+++ b/lite/kernels/npu/bridges/engine.h
--- a/lite/kernels/npu/bridges/graph.h
+++ b/lite/kernels/npu/bridges/graph.h
--- a/lite/kernels/npu/bridges/reduce_mean_op.cc
+++ b/lite/kernels/npu/bridges/reduce_mean_op.cc
--- a/lite/kernels/npu/bridges/reduce_mean_op_test.cc
+++ b/lite/kernels/npu/bridges/reduce_mean_op_test.cc
--- a/lite/kernels/npu/bridges/registry.cc
+++ b/lite/kernels/npu/bridges/registry.cc
--- a/lite/kernels/npu/bridges/registry.h
+++ b/lite/kernels/npu/bridges/registry.h
--- a/lite/kernels/npu/bridges/utility.h
+++ b/lite/kernels/npu/bridges/utility.h
--- a/lite/kernels/npu/subgraph_compute.cc
+++ b/lite/kernels/npu/subgraph_compute.cc
--- a/lite/kernels/npu/subgraph_compute.h
+++ b/lite/kernels/npu/subgraph_compute.h
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
--- a/lite/kernels/opencl/activation_buffer_compute.cc
+++ b/lite/kernels/opencl/activation_buffer_compute.cc
--- a/lite/kernels/opencl/activation_image_compute.cc
+++ b/lite/kernels/opencl/activation_image_compute.cc
--- a/lite/kernels/opencl/bilinear_interp_image_compute.cc
+++ b/lite/kernels/opencl/bilinear_interp_image_compute.cc
--- a/lite/kernels/opencl/box_coder_image_compute.cc
+++ b/lite/kernels/opencl/box_coder_image_compute.cc
--- a/lite/kernels/opencl/concat_buffer_compute.cc
+++ b/lite/kernels/opencl/concat_buffer_compute.cc
--- a/lite/kernels/opencl/concat_image_compute.cc
+++ b/lite/kernels/opencl/concat_image_compute.cc
--- a/lite/kernels/opencl/conv_buffer_compute.h
+++ b/lite/kernels/opencl/conv_buffer_compute.h
--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
--- a/lite/kernels/opencl/conv_image_compute.h
+++ b/lite/kernels/opencl/conv_image_compute.h
--- a/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc
--- a/lite/kernels/opencl/dropout_image_compute.cc
+++ b/lite/kernels/opencl/dropout_image_compute.cc
--- a/lite/kernels/opencl/elementwise_add_buffer_compute.h
+++ b/lite/kernels/opencl/elementwise_add_buffer_compute.h
--- a/lite/kernels/opencl/elementwise_add_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_add_image_compute.cc
--- a/lite/kernels/opencl/elementwise_add_image_compute.h
+++ b/lite/kernels/opencl/elementwise_add_image_compute.h
--- a/lite/kernels/opencl/elementwise_mul_compute.cc
+++ b/lite/kernels/opencl/elementwise_mul_compute.cc
--- a/lite/kernels/opencl/elementwise_mul_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc
--- a/lite/kernels/opencl/elementwise_sub_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_sub_image_compute.cc
--- a/lite/kernels/opencl/elementwise_sub_image_compute.h
+++ b/lite/kernels/opencl/elementwise_sub_image_compute.h
--- a/lite/kernels/opencl/expand_image_compute.cc
+++ b/lite/kernels/opencl/expand_image_compute.cc
--- a/lite/kernels/opencl/expand_image_compute_test.cc
+++ b/lite/kernels/opencl/expand_image_compute_test.cc
--- a/lite/kernels/opencl/fc_buffer_compute.cc
+++ b/lite/kernels/opencl/fc_buffer_compute.cc
--- a/lite/kernels/opencl/grid_sampler_image_compute.cc
+++ b/lite/kernels/opencl/grid_sampler_image_compute.cc
--- a/lite/kernels/opencl/instance_norm_image_compute.cc
+++ b/lite/kernels/opencl/instance_norm_image_compute.cc
--- a/lite/kernels/opencl/io_copy_buffer_compute.cc
+++ b/lite/kernels/opencl/io_copy_buffer_compute.cc
--- a/lite/kernels/opencl/layout_image_compute.cc
+++ b/lite/kernels/opencl/layout_image_compute.cc
--- a/lite/kernels/opencl/lrn_image_compute.cc
+++ b/lite/kernels/opencl/lrn_image_compute.cc
--- a/lite/kernels/opencl/mul_buffer_compute.cc
+++ b/lite/kernels/opencl/mul_buffer_compute.cc
--- a/lite/kernels/opencl/nearest_interp_image_compute.cc
+++ b/lite/kernels/opencl/nearest_interp_image_compute.cc
--- a/lite/kernels/opencl/pad2d_image_compute.cc
+++ b/lite/kernels/opencl/pad2d_image_compute.cc
--- a/lite/kernels/opencl/pixel_shuffle_image_compute.cc
+++ b/lite/kernels/opencl/pixel_shuffle_image_compute.cc
--- a/lite/kernels/opencl/pixel_shuffle_image_compute_test.cc
+++ b/lite/kernels/opencl/pixel_shuffle_image_compute_test.cc
--- a/lite/kernels/opencl/pool_buffer_compute.cc
+++ b/lite/kernels/opencl/pool_buffer_compute.cc
--- a/lite/kernels/opencl/pool_image_compute.cc
+++ b/lite/kernels/opencl/pool_image_compute.cc
--- a/lite/kernels/opencl/reshape_image_compute.cc
+++ b/lite/kernels/opencl/reshape_image_compute.cc
--- a/lite/kernels/opencl/scale_image_compute.cc
+++ b/lite/kernels/opencl/scale_image_compute.cc
--- a/lite/kernels/opencl/slice_image_compute.cc
+++ b/lite/kernels/opencl/slice_image_compute.cc
--- a/lite/kernels/rknpu/bridges/graph.h
+++ b/lite/kernels/rknpu/bridges/graph.h
--- a/lite/kernels/rknpu/bridges/utility.cc
+++ b/lite/kernels/rknpu/bridges/utility.cc
--- a/lite/kernels/rknpu/bridges/utility.h
+++ b/lite/kernels/rknpu/bridges/utility.h
--- a/lite/kernels/x86/elementwise_compute.cc
+++ b/lite/kernels/x86/elementwise_compute.cc
--- a/lite/kernels/x86/elementwise_compute.h
+++ b/lite/kernels/x86/elementwise_compute.h
--- a/lite/kernels/x86/elementwise_op_function.h
+++ b/lite/kernels/x86/elementwise_op_function.h
--- a/lite/kernels/x86/search_grnn_compute.cc
+++ b/lite/kernels/x86/search_grnn_compute.cc
--- a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc
+++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc
--- a/lite/kernels/xpu/bridges/graph.h
+++ b/lite/kernels/xpu/bridges/graph.h
--- a/lite/model_parser/compatibility.cc
+++ b/lite/model_parser/compatibility.cc
--- a/lite/model_parser/compatibility.h
+++ b/lite/model_parser/compatibility.h
--- a/lite/model_parser/compatible_pb.cc
+++ b/lite/model_parser/compatible_pb.cc
--- a/lite/model_parser/compatible_pb_test.cc
+++ b/lite/model_parser/compatible_pb_test.cc
--- a/lite/model_parser/cpp/op_desc.cc
+++ b/lite/model_parser/cpp/op_desc.cc
--- a/lite/model_parser/cpp/op_desc.h
+++ b/lite/model_parser/cpp/op_desc.h
--- a/lite/model_parser/cpp/var_desc.h
+++ b/lite/model_parser/cpp/var_desc.h
--- a/lite/model_parser/desc_apis.h
+++ b/lite/model_parser/desc_apis.h
--- a/lite/model_parser/model_parser.cc
+++ b/lite/model_parser/model_parser.cc
--- a/lite/model_parser/naive_buffer/CMakeLists.txt
+++ b/lite/model_parser/naive_buffer/CMakeLists.txt
--- a/lite/model_parser/naive_buffer/op_desc.h
+++ b/lite/model_parser/naive_buffer/op_desc.h
--- a/lite/model_parser/naive_buffer/var_desc.cc
+++ b/lite/model_parser/naive_buffer/var_desc.cc
--- a/lite/model_parser/naive_buffer/var_desc.h
+++ b/lite/model_parser/naive_buffer/var_desc.h
--- a/lite/model_parser/pb/op_desc.h
+++ b/lite/model_parser/pb/op_desc.h
--- a/lite/model_parser/pb/var_desc.cc
+++ b/lite/model_parser/pb/var_desc.cc
--- a/lite/model_parser/pb/var_desc.h
+++ b/lite/model_parser/pb/var_desc.h
--- a/lite/operators/__xpu__multi_encoder_op.cc
+++ b/lite/operators/__xpu__multi_encoder_op.cc
--- a/lite/operators/activation_ops.h
+++ b/lite/operators/activation_ops.h
--- a/lite/operators/affine_channel_op.h
+++ b/lite/operators/affine_channel_op.h
--- a/lite/operators/argmax_op.h
+++ b/lite/operators/argmax_op.h
--- a/lite/operators/assign_op.h
+++ b/lite/operators/assign_op.h
--- a/lite/operators/assign_value_op.h
+++ b/lite/operators/assign_value_op.h
--- a/lite/operators/axpy_op.h
+++ b/lite/operators/axpy_op.h
--- a/lite/operators/batch_norm_op.h
+++ b/lite/operators/batch_norm_op.h
--- a/lite/operators/box_clip_op.h
+++ b/lite/operators/box_clip_op.h
--- a/lite/operators/box_coder_op.h
+++ b/lite/operators/box_coder_op.h
--- a/lite/operators/calib_op.h
+++ b/lite/operators/calib_op.h
--- a/lite/operators/compare_op.h
+++ b/lite/operators/compare_op.h
--- a/lite/operators/concat_op.h
+++ b/lite/operators/concat_op.h
--- a/lite/operators/conv_op.h
+++ b/lite/operators/conv_op.h
--- a/lite/operators/conv_transpose_op.h
+++ b/lite/operators/conv_transpose_op.h
--- a/lite/operators/elementwise_ops.h
+++ b/lite/operators/elementwise_ops.h
--- a/lite/operators/fc_op.h
+++ b/lite/operators/fc_op.h
--- a/lite/operators/increment_op.h
+++ b/lite/operators/increment_op.h
--- a/lite/operators/instance_norm_op.h
+++ b/lite/operators/instance_norm_op.h
--- a/lite/operators/interpolate_op.h
+++ b/lite/operators/interpolate_op.h
--- a/lite/operators/io_copy_op.h
+++ b/lite/operators/io_copy_op.h
--- a/lite/operators/layer_norm_op.h
+++ b/lite/operators/layer_norm_op.h
--- a/lite/operators/layout_op.h
+++ b/lite/operators/layout_op.h
--- a/lite/operators/logical_op.h
+++ b/lite/operators/logical_op.h
--- a/lite/operators/lrn_op.h
+++ b/lite/operators/lrn_op.h
--- a/lite/operators/matmul_op.h
+++ b/lite/operators/matmul_op.h
--- a/lite/operators/mean_op.h
+++ b/lite/operators/mean_op.h
--- a/lite/operators/mul_op.h
+++ b/lite/operators/mul_op.h
--- a/lite/operators/negative_op.h
+++ b/lite/operators/negative_op.h
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
--- a/lite/operators/pool_op.h
+++ b/lite/operators/pool_op.h
--- a/lite/operators/power_op.h
+++ b/lite/operators/power_op.h
--- a/lite/operators/reduce_max_op.cc
+++ b/lite/operators/reduce_max_op.cc
--- a/lite/operators/reduce_max_op.h
+++ b/lite/operators/reduce_max_op.h
--- a/lite/operators/reduce_mean_op.cc
+++ b/lite/operators/reduce_mean_op.cc
--- a/lite/operators/reduce_mean_op.h
+++ b/lite/operators/reduce_mean_op.h
--- a/lite/operators/reduce_ops.cc
+++ b/lite/operators/reduce_ops.cc
--- a/lite/operators/reduce_prod_op.cc
+++ b/lite/operators/reduce_prod_op.cc
--- a/lite/operators/reduce_prod_op.h
+++ b/lite/operators/reduce_prod_op.h
--- a/lite/operators/relu_op.h
+++ b/lite/operators/relu_op.h
--- a/lite/operators/reshape_op.h
+++ b/lite/operators/reshape_op.h
--- a/lite/operators/scale_op.h
+++ b/lite/operators/scale_op.h
--- a/lite/operators/search_aligned_mat_mul_op.h
+++ b/lite/operators/search_aligned_mat_mul_op.h
--- a/lite/operators/search_fc_op.h
+++ b/lite/operators/search_fc_op.h
--- a/lite/operators/search_seq_fc_op.h
+++ b/lite/operators/search_seq_fc_op.h
--- a/lite/operators/search_seq_softmax_op.h
+++ b/lite/operators/search_seq_softmax_op.h
--- a/lite/operators/slice_op.cc
+++ b/lite/operators/slice_op.cc
--- a/lite/operators/softmax_op.h
+++ b/lite/operators/softmax_op.h
--- a/lite/operators/squeeze_op.h
+++ b/lite/operators/squeeze_op.h
--- a/lite/tests/api/test_mobilenetv1_int8_apu.cc
+++ b/lite/tests/api/test_mobilenetv1_int8_apu.cc
--- a/lite/tests/kernels/reduce_max_compute_test.cc
+++ b/lite/tests/kernels/reduce_max_compute_test.cc
--- a/lite/tests/kernels/reduce_mean_compute_test.cc
+++ b/lite/tests/kernels/reduce_mean_compute_test.cc
--- a/lite/tests/kernels/reduce_prod_compute_test.cc
+++ b/lite/tests/kernels/reduce_prod_compute_test.cc
--- a/lite/tests/kernels/reduce_sum_compute_test.cc
+++ b/lite/tests/kernels/reduce_sum_compute_test.cc
--- a/lite/tests/kernels/squeeze_compute_test.cc
+++ b/lite/tests/kernels/squeeze_compute_test.cc
--- a/lite/tests/kernels/unsqueeze_compute_test.cc
+++ b/lite/tests/kernels/unsqueeze_compute_test.cc
--- a/lite/tests/math/conv_int8_compute_test.cc
+++ b/lite/tests/math/conv_int8_compute_test.cc
--- a/lite/tests/math/gemm_int8_compute_test.cc
+++ b/lite/tests/math/gemm_int8_compute_test.cc
--- a/lite/tests/math/gemv_int8_compute_test.cc
+++ b/lite/tests/math/gemv_int8_compute_test.cc
--- a/lite/tests/utils/naive_math_impl.h
+++ b/lite/tests/utils/naive_math_impl.h
--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
--- a/lite/tools/build_android.sh
+++ b/lite/tools/build_android.sh
--- a/lite/tools/build_linux.sh
+++ b/lite/tools/build_linux.sh
--- a/lite/tools/check_api_approvals.sh
+++ b/lite/tools/check_api_approvals.sh
--- a/lite/tools/check_pr_approval.py
+++ b/lite/tools/check_pr_approval.py
--- a/lite/tools/ci_build.sh
+++ b/lite/tools/ci_build.sh
--- a/lite/tools/debug/debug_utils.h
+++ b/lite/tools/debug/debug_utils.h
--- a/lite/utils/CMakeLists.txt
+++ b/lite/utils/CMakeLists.txt
--- a/lite/utils/any.cc
+++ b/lite/utils/any.cc
--- a/lite/utils/any.h
+++ b/lite/utils/any.h
--- a/lite/utils/cp_logging.h
+++ b/lite/utils/cp_logging.h
--- a/lite/utils/cv/bgr_rotate.cc
+++ b/lite/utils/cv/bgr_rotate.cc
--- a/lite/utils/cv/image_resize.cc
+++ b/lite/utils/cv/image_resize.cc
--- a/lite/utils/cv/paddle_image_preprocess.cc
+++ b/lite/utils/cv/paddle_image_preprocess.cc
--- a/lite/utils/factory.h
+++ b/lite/utils/factory.h
--- a/lite/utils/float16.h
+++ b/lite/utils/float16.h
--- a/lite/utils/float16_test.cc
+++ b/lite/utils/float16_test.cc
--- a/lite/utils/float16_test.cu
+++ b/lite/utils/float16_test.cu
--- a/lite/utils/io.h
+++ b/lite/utils/io.h
--- a/lite/utils/logging.h
+++ b/lite/utils/logging.h
--- a/tools/coverage/coverage_diff.py
+++ b/tools/coverage/coverage_diff.py
--- a/tools/coverage/coverage_lines.py
+++ b/tools/coverage/coverage_lines.py
--- a/tools/coverage/gcda_clean.py
+++ b/tools/coverage/gcda_clean.py
--- a/tools/coverage/paddle_lite_coverage.sh
+++ b/tools/coverage/paddle_lite_coverage.sh
--- a/tools/coverage/pull_request.py
+++ b/tools/coverage/pull_request.py