Commit 8d3d2e58 authored by: C chonwhite

fixed merge conflicts

......@@ -18,7 +18,7 @@
* Test device (Android NDK ndk-r17c)
* Snapdragon 855
* xiaomi mi9, snapdragon 855
* xiaomi mi9, snapdragon 855 (enable sdot instruction)
* 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz
* Snapdragon 845
......@@ -33,7 +33,7 @@
* HUAWEI Mate10
* Test notes
* branch: release/v2.3.0
* branch: release/v2.6.0
* warmup=10, repeats=30; the average time is reported, in ms
* When the thread count is 1, ```DeviceInfo::Global().SetRunMode``` is set to LITE_POWER_HIGH; otherwise it is set to LITE_POWER_NO_BIND
* The model input shape is {1, 3, 224, 224}, and every element of the input image is 1
......@@ -48,75 +48,75 @@
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |33.27 |19.52 |11.14 |31.72 |18.76 |10.24 |
mobilenet_v2 |29.08 |15.79 |9.25 |25.89 |14.17 |8.38 |
shufflenet_v2 |4.40 |3.09 |2.30 |4.28 |3.02 |2.35 |
squeezenet_v1.1 |19.96 |12.61 |8.76 |18.25 |11.46 |7.97 |
mnasnet |21.00 |12.54 |7.28 |19.65 |11.65 |6.96 |
mobilenet_v1 |35.11 |20.67 |11.83 |30.56 |18.59 |10.44 |
mobilenet_v2 |26.36 |15.83 |9.29 |21.64 |13.25 |7.95 |
shufflenet_v2 |4.56 |3.14 |2.35 |4.07 |2.89 |2.28 |
squeezenet_v1.1 |21.27 |13.55 |8.49 |18.05 |11.51 |7.83 |
mnasnet |21.40 |13.18 |7.63 |18.84 |11.40 |6.80 |
Snapdragon 845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |66.36 |35.97 |19.45 |62.66 |33.87 |17.85 |
mobilenet_v2 |45.86 |25.53 |14.6 |41.58 |23.24 |13.39 |
shufflenet_v2 |7.58 |4.89 |3.41 |7.44 |4.91 |3.58 |
squeezenet_v1.1 |37.15 |22.74 |13.51 |34.69 |21.27 |12.74 |
mnasnet |40.09 |21.73 |11.91 |38.19 |21.02 |12.11 |
mobilenet_v1 |65.56 |37.17 |19.65 |63.23 |32.98 |17.68 |
mobilenet_v2 |45.89 |25.20 |14.39 |41.03 |22.94 |12.98 |
shufflenet_v2 |7.31 |4.66 |3.27 |7.08 |4.71 |3.41 |
squeezenet_v1.1 |36.98 |22.53 |13.45 |34.27 |20.96 |12.60 |
mnasnet |39.85 |23.64 |12.25 |37.81 |20.70 |11.81 |
Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |96.98 |53.92 |32.24 |89.31 |48.02 |27.58 |
mobilenet_v2 |67.72 |37.66 |23.82 |60.10 |34.36 |21.05 |
shufflenet_v2 |10.72 |6.62 |4.63 |10.10 |6.44 |4.63 |
squeezenet_v1.1 |53.89 |33.28 |20.73 |50.83 |32.31 |19.51 |
mnasnet |59.55 |33.53 |20.32 |56.21 |31.58 |19.06 |
mobilenet_v1 |92.77 |51.56 |30.14 |87.46 |48.02 |26.42 |
mobilenet_v2 |65.78 |36.52 |22.34 |58.31 |33.04 |19.87 |
shufflenet_v2 |10.39 |6.26 |4.46 |9.72 |6.19 |4.41 |
squeezenet_v1.1 |53.59 |33.16 |20.13 |51.56 |31.81 |19.10 |
mnasnet |57.44 |32.62 |19.47 |54.99 |30.69 |17.98 |
#### Caffe model
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |33.36 |19.45 |11.26 |31.63 |18.74 |10.31 |
mobilenet_v2 |31.63 |19.21 |11.61 |28.34 |17.14 |10.16 |
shufflenet_v2 |4.46 |3.08 |2.32 |4.26 |2.98 |2.35 |
mobilenet_v1 |32.38 |18.65 |10.69 |30.75 |18.11 |9.88 |
mobilenet_v2 |29.45 |17.86 |10.81 |26.61 |16.26 |9.67 |
shufflenet_v2 |5.04 |3.14 |2.20 |4.09 |2.85 |2.25 |
Snapdragon 845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |66.32 |35.83 |19.56 |62.52 |33.79 |17.91 |
mobilenet_v2 |58.46 |32.69 |18.56 |53.72 |29.86 |16.80 |
shufflenet_v2 |7.65 |4.82 |3.46 |7.55 |4.97 |3.62 |
mobilenet_v1 |65.26 |35.19 |19.11 |61.42 |33.15 |17.48 |
mobilenet_v2 |55.59 |31.31 |17.68 |51.54 |29.69 |16.00 |
shufflenet_v2 |7.42 |4.73 |3.33 |7.18 |4.75 |3.39 |
Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |95.38 |54.09 |32.03 |95.05 |48.33 |27.54 |
mobilenet_v2 |88.46 |48.98 |30.23 |79.28 |44.64 |27.10 |
shufflenet_v2 |10.07 |6.51 |4.61 |10.31 |6.50 |4.66 |
mobilenet_v1 |95.38 |52.16 |30.37 |92.10 |46.71 |26.31 |
mobilenet_v2 |82.89 |45.49 |28.14 |74.91 |41.88 |25.25 |
shufflenet_v2 |10.25 |6.36 |4.42 |9.68 |6.20 |4.42 |
#### int8 quantized model benchmark data
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 |
mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 |
mobilenet_v1 |37.18 |21.71 |11.16 | 14.41 |8.34 |4.37 |
mobilenet_v2 |27.95 |16.57 |8.97 | 13.68 |8.16 |4.67 |
Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |60.76 |32.25 |16.66 |56.57 |29.84 |15.24 |
mobilenet_v2 |49.38 |31.10 |22.07 |47.52 |28.18 |19.24 |
mobilenet_v1 |61.63 |32.60 |16.49 |57.36 |29.74 |15.50 |
mobilenet_v2 |47.13 |25.62 |13.56 |41.87 |22.42 |11.72 |
Kirin 970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |65.95 |34.39 |18.68 |60.86 |30.98 |16.31 |
mobilenet_v2 |68.87 |39.39 |24.43 |65.57 |37.31 |20.87 |
mobilenet_v1 |63.13 |32.63 |16.85 |58.92 |29.96 |15.42 |
mobilenet_v2 |48.60 |25.43 |13.76 |43.06 |22.10 |12.09 |
......@@ -32,14 +32,26 @@ tar zxf mobilenet_v1.tar.gz
![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/3inference_model.png)
(2) Download the [opt tool](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt), put it in the same folder, and run the following command in a terminal to convert the model:
(2) Model conversion
```shell
wget https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt
chmod +x opt
./opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=./mobilenet_v1_opt
```
- Before v2.6.0
Download the [opt tool](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt), put it in the same folder, and run the following command in a terminal to convert the model
```shell
wget https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt
chmod +x opt
./opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=./mobilenet_v1_opt
```
- v2.6.0 and later
Install paddlelite, then run the following command in a terminal to convert the model
```shell
python -m pip install paddlelite
paddle_lite_opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=./mobilenet_v1_opt
```
**The result is shown in the figure below:**
![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/2opt_model.png)
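The conversion can also be driven from Python with the `Opt` class bundled in the wheel, mirroring the CLI flags above (a minimal sketch; the ARM target in `set_valid_places` is an assumption for this mobile demo):
```python
import paddlelite.lite as lite

opt = lite.Opt()
opt.set_model_dir("./mobilenet_v1")        # folder holding the downloaded model
opt.set_optimize_out("mobilenet_v1_opt")   # output prefix for the optimized model
opt.set_valid_places("arm")                # assumption: ARM target, matching the mobile demo
opt.run()
```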
......
# Python Demo
## 1. Download the latest Python inference library
```shell
python -m pip install paddlelite
```
## 2. Convert the model
Native PaddlePaddle models must be converted by the [opt]() tool into the naive_buffer format supported by Paddle-Lite.
Take the `mobilenet_v1` model as an example:
(1) Download the [mobilenet_v1 model](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz) and extract it:
```shell
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxf mobilenet_v1.tar.gz
```
(2) Use the opt tool:
When loading a model from disk, the model and parameter paths take one of two forms, depending on how the model and parameter files are stored.
- Linux
- Non-combined form: when the model folder model_dir contains one model file and multiple parameter files, pass the model folder path; the model file name defaults to __model__.
```shell
paddle_lite_opt --model_dir=./mobilenet_v1 \
--optimize_out=mobilenet_v1_opt \
--optimize_out_type=naive_buffer \
--valid_targets=x86
```
- Combined form: when the model folder model_dir contains only one model file __model__ and one parameter file __params__, pass the model file path and the parameter file path
```shell
paddle_lite_opt --model_file=./mobilenet_v1/__model__ \
--param_file=./mobilenet_v1/__params__ \
--optimize_out=mobilenet_v1_opt \
--optimize_out_type=naive_buffer \
--valid_targets=x86
```
- Windows
Windows does not yet support running the model converter directly from the command line; a Python script is required
```python
import paddlelite.lite as lite
a=lite.Opt()
# non-combined form
a.set_model_dir("D:\\YOUR_MODEL_PATH\\mobilenet_v1")
# combined form
# a.set_model_file("D:\\YOUR_MODEL_PATH\\mobilenet_v1\\__model__")
# a.set_param_file("D:\\YOUR_MODEL_PATH\\mobilenet_v1\\__params__")
a.set_optimize_out("mobilenet_v1_opt")
a.set_valid_places("x86")
a.run()
```
- macOS
The opt tool is used the same way as on Linux (Python inference is not yet supported on macOS; this will be fixed in the next release)
## 3. Write the inference program
With the inference library and model ready, we can write a program to run inference. We provide example demos covering image classification, object detection, and other scenarios for reference. Create a file mobilenetV1_light_api.py;
the complete Python demo code is at [demo/python](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/python/mobilenetv1_light_api.py)
(1) Set up the config
```python
from paddlelite.lite import *
config = MobileConfig()
config.set_model_from_file("/YOUR_MODEL_PATH/mobilenet_v1_opt.nb")
```
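Optionally, thread count and power mode can be set on the same config (a sketch; it assumes the Python binding mirrors the C++ `MobileConfig` setters `set_threads` and `set_power_mode`, which may vary by release):
```python
# assumption: these setters exist in the Python binding as in the C++ API
config.set_threads(1)
config.set_power_mode(PowerMode.LITE_POWER_HIGH)
```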
(2) Create the predictor
```python
predictor = create_paddle_predictor(config)
```
(3) Set the input data
```python
input_tensor = predictor.get_input(0)
input_tensor.resize([1, 3, 224, 224])
input_tensor.set_float_data([1.] * 3 * 224 * 224)
```
(4) Run inference
```python
predictor.run()
```
(5) Get the output data
```python
output_tensor = predictor.get_output(0)
print(output_tensor.shape())
print(output_tensor.float_data()[:10])
```
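Putting the five steps together, `mobilenetV1_light_api.py` looks like this (a sketch assembled from the snippets above; the model path is a placeholder):
```python
from paddlelite.lite import *

# (1) set up the config
config = MobileConfig()
config.set_model_from_file("/YOUR_MODEL_PATH/mobilenet_v1_opt.nb")

# (2) create the predictor
predictor = create_paddle_predictor(config)

# (3) feed an all-ones input of shape {1, 3, 224, 224}
input_tensor = predictor.get_input(0)
input_tensor.resize([1, 3, 224, 224])
input_tensor.set_float_data([1.] * 3 * 224 * 224)

# (4) run inference
predictor.run()

# (5) fetch the output
output_tensor = predictor.get_output(0)
print(output_tensor.shape())
print(output_tensor.float_data()[:10])
```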
## 4. Run the script
```shell
python mobilenetV1_light_api.py
```
......@@ -4,8 +4,6 @@
Paddle-Lite supports building the x86 inference library in a Docker or Linux environment. See [Environment Preparation](../user_guides/source_compile) for setup.
(Note: a non-Docker Linux environment must be Ubuntu 16.04)
### Build
1、 Download the code
......@@ -20,10 +18,11 @@ git checkout release/v2.6.0
```bash
cd Paddle-Lite
./lite/tools/build.sh x86
./lite/tools/build.sh --build_python=ON x86
# other optional build flags
# --with_log=OFF      disable LOG output
# --build_python=OFF  skip building the Python inference library
```
### Build output description
......@@ -53,8 +52,17 @@ x86编译结果位于 `build.lite.x86/inference_lite_lib`
- `mobilenetv1_full` : C++ demo that runs mobilenet_v1 inference with the full_api
- `mobilenetv1_light` : C++ demo that runs mobilenet_v1 inference with the light_api
5、 `demo/python` folder: Python demos for the x86 inference library
- `mobilenetv1_full_api.py` : Python demo that runs mobilenet_v1 inference with the full_api
- `mobilenetv1_light_api.py` : Python demo that runs mobilenet_v1 inference with the light_api
6、 `python` folder: Python library files and the corresponding .whl package
- `install` folder: the built .whl package is located at `install/dist/*.whl`
- `lib` folder: library files the .whl package depends on
**(If the Python inference library is not needed, replace the build command with `./lite/tools/build.sh x86`.)**
### x86 inference API example
......@@ -64,7 +72,8 @@ x86编译结果位于 `build.lite.x86/inference_lite_lib`
mobilenetv1_full/
|-- CMakeLists.txt
|-- build.sh
`-- mobilenet_full_api.cc
|-- build.bat
`-- mobilenet_full_api.cc
```
This demo is built with cmake: `CMakeLists.txt` is the cmake script, `mobilenet_full_api.cc` is the x86 example source, and `build.sh` is the build script.
......@@ -168,8 +177,8 @@ int main(int argc, char** argv) {
#### Build environment requirements
- Windows 10 Professional
- GPU mode is not yet supported on Windows
- *Python 2.7/3.5.1+/3.6/3.7 (64 bit)*
- GPU builds are not yet supported on Windows
- *Python 2.7/3.5.1+ (64 bit)*
- *pip or pip3 9.0.1+ (64 bit)*
- *Visual Studio 2015 Update3*
......@@ -187,15 +196,15 @@ int main(int argc, char** argv) {
```bash
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
# switch to the release branch
git checkout release/v2.3
git checkout release/v2.6.0
```
2、 Build from source
2、 Build from source (enter the corresponding parameters when prompted)
```bash
```dos
cd Paddle-Lite
lite/tools/build_windows.bat with_extra with_python with_profile
lite\tools\build_windows.bat with_extra with_python with_profile
```
The build script `lite/tools/build.bat` takes the following additional parameters:
The build script `build_windows.bat` takes the following additional parameters:
| Parameter | Description | Value |
|-----------|-------------|-------------|
......@@ -203,40 +212,62 @@ lite/tools/build_windows.bat with_extra with_python with_profile
| with_python | Optional. Whether to build the Python inference library (default OFF). | `ON`/`OFF` |
| with_profile | Optional. Whether to enable profiler support (default OFF). | `ON`/`OFF` |
### Build result
### Build output description
The x86 build output is located in `build.lite.x86/inference_lite_lib`
**Contents**:
1、 `bin` folder: executable tool `test_model_bin`
2、 `cxx` folder: C++ library files and the corresponding headers
1、 `cxx` folder: C++ library files and the corresponding headers
- `include` : headers
- `lib` : library files
- Bundled static libraries:
- Static libraries:
- `libpaddle_api_full_bundled.lib` : full_api static library
- `libpaddle_api_light_bundled.lib` : light_api static library
3、 `third_party` folder: third-party library files
2、 `third_party` folder: the third-party dependency mklml
- mklml : the mklml math library that the Paddle-Lite inference library depends on
3、 `demo/cxx` folder: C++ demos for the x86 inference library
- `mobilenetv1_full` : C++ demo that runs mobilenet_v1 inference with the full_api
- `mobilenetv1_light` : C++ demo that runs mobilenet_v1 inference with the light_api
4、 `demo/python`: Python demos for the x86 inference library
- `mobilenetv1_full_api.py`: Python demo that runs mobilenet_v1 inference with the full_api
- `mobilenetv1_light_api.py`: Python demo that runs mobilenet_v1 inference with the light_api
5、 `python` folder: Python library files and the corresponding .whl package
- `install` folder: the built .whl package is located at `install/dist/*.whl`
- `lib` folder: library files the .whl package depends on
### x86 inference API example
1、 We provide a Windows x86 API example that runs mobilenet_v1: [mobilenet_full_x86demo](https://paddlelite-data.bj.bcebos.com/x86/mobilenet_full_x86demo.zip). After downloading and extracting, the contents are as follows:
1、 `mobilenetv1_full` directory layout
![](https://paddlelite-data.bj.bcebos.com/x86/x86-doc/demo.png)
```bash
mobilenetv1_full/
|-- CMakeLists.txt
|-- build.sh
|-- build.bat
`-- mobilenet_full_api.cc
```
`mobilenet_v1` is the model, `lib` and `include` are the Paddle-Lite inference library and headers, `third_party` contains the build-time third-party dependency `mklml`, `mobilenet_full_api.cc` is the x86 example source, and `build.bat` is the build script.
This demo is built with cmake: `CMakeLists.txt` is the cmake script, `mobilenet_full_api.cc` is the x86 example source, `build.sh` is the Linux x86 build script, and `build.bat` is the Windows x86 build script.
2、 Demo contents and usage
2、 Demo usage
``` bash
# 1. Build (run this script in the VS2015 command prompt)
# 1. Build
cd mobilenetv1_full
build.bat
cd build
```
The build output is `Release\\mobilenet_full_api.exe` in the current directory
``` bash
The build output is `Release\mobilenet_full_api.exe` in the current directory
``` dos
# 2. Run inference
Release\\mobilenet_full_api.exe ..\mobilenet_v1
Release\mobilenet_full_api.exe mobilenet_v1
```
`mobilenet_v1` is the model path, and `mobilenet_full_api.exe` is the executable built in step 1
Download and extract the [`mobilenet_v1`](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz) model into the current `build` directory, then run the command above to perform inference
......@@ -47,6 +47,7 @@ Welcome to Paddle-Lite's documentation!
demo_guides/cpp_demo
demo_guides/java_demo
demo_guides/python_demo
demo_guides/android_app_demo
demo_guides/ios_app_demo
demo_guides/x86
......
# `build_extra` parameter:
# `with_extra` parameter:
The Lite inference library comes in two flavors: the **basic library** and the **full library (with_extra)**. The basic library contains only the basic CV operators (OPs) and is smaller; the full library contains all Lite operators, is larger, and supports more models.
......
......@@ -3,49 +3,48 @@
## Build variants
- ARM_Version=`armv7/armv8` ARM version; choose armv7 or armv8
- arch=`armv7/armv8` ARM version; choose armv7 or armv8
- arm_os=`android\ios\armlinux` target platform; supported ARM mobile platforms are `ios`, `armlinux`, and `android`
- toolchain=`gcc/clang` compiler for the source build; defaults to `gcc`
- android_stl=`c++_static/c++_shared` how the Lite inference library links the STL; static and dynamic linking are supported
- with_extra=`ON/OFF` whether to build the full operator set; when OFF only the basic CV-related OPs are built, [parameter details](library)
- with_cv=`ON/OFF` whether to build the Paddle-Lite CV APIs
- arm_os=`android\ios\ios64\armlinux` target platform; supported ARM mobile platforms are `ios\ios64`, `armlinux`, and `android`
- arm_lang=`gcc/clang` compiler for the source build; defaults to `gcc`
## Android(toolchain=gcc)
- arm_stl=`c++_static/c++_shared` how the Lite inference library links the STL; static and dynamic linking are supported
- build_extra=`ON/OFF` whether to build the full operator set; when OFF only the basic CV-related OPs are built, [parameter details](library)
- `tiny_publish/full_publish` build mode: `tiny_publish` builds the mobile deployment library; `full_publish` builds the deployment library together with the third-party dependencies
## Android
|ARM Version|build_extra|arm_stl|target|Download|
| Arch |with_extra|arm_stl|with_cv|Download|
|:-------:|:-----:|:-----:|:-----:|:-------:|
|armv7|OFF|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_static.tiny_publish.tar.gz)|
|armv7|OFF|c++_static|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_static.full_publish.tar.gz)|
|armv7|OFF|c++_shared|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_shared.tiny_publish.tar.gz)|
|armv7|OFF|c++_shared|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_shared.full_publish.tar.gz)|
|armv7|ON|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.tiny_publish.tar.gz)|
|armv7|ON|c++_static|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.full_publish.tar.gz)|
|armv7|ON|c++_shared|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.tiny_publish.tar.gz)|
|armv7|ON|c++_shared|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.full_publish.tar.gz)|
|armv8|OFF|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.tiny_publish.tar.gz)|
|armv8|OFF|c++_static|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.full_publish.tar.gz)|
|armv8|OFF|c++_shared|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_shared.tiny_publish.tar.gz)|
|armv8|OFF|c++_shared|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_shared.full_publish.tar.gz)|
|armv8|ON|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.tiny_publish.tar.gz)|
|armv8|ON|c++_static|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.full_publish.tar.gz)|
|armv8|ON|c++_shared|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.tiny_publish.tar.gz)|
|armv8|ON|c++_shared|full_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.full_publish.tar.gz)|
|armv7|OFF|c++_shared|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv7.gcc.c++_shared.tiny_publish.tar.gz)|
|armv7|OFF|c++_shared|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv7.gcc.c++_shared.with_cv.tiny_publish.tar.gz)|
|armv7|ON|c++_shared|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.tiny_publish.tar.gz)|
|armv7|ON|c++_shared|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.with_cv.tiny_publish.tar.gz)|
|armv7|OFF|c++_static|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv7.gcc.c++_static.tiny_publish.tar.gz)|
|armv7|OFF|c++_static|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv7.gcc.c++_static.with_cv.tiny_publish.tar.gz)|
|armv7|ON|c++_static|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.tiny_publish.tar.gz)|
|armv7|ON|c++_static|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.with_cv.tiny_publish.tar.gz)|
|armv8|OFF|c++_shared|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv8.gcc.c++_shared.tiny_publish.tar.gz)|
|armv8|OFF|c++_shared|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv8.gcc.c++_shared.with_cv.tiny_publish.tar.gz)|
|armv8|ON|c++_shared|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.tiny_publish.tar.gz)|
|armv8|ON|c++_shared|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.with_cv.tiny_publish.tar.gz)|
|armv8|OFF|c++_static|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv8.gcc.c++_static.tiny_publish.tar.gz)|
|armv8|OFF|c++_static|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv8.gcc.c++_static.with_cv.tiny_publish.tar.gz)|
|armv8|ON|c++_static|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.tiny_publish.tar.gz)|
|armv8|ON|c++_static|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/Android/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv.tiny_publish.tar.gz)|
## iOS
|ARM Version|arm_os|with_extra|Download|
|ARM Version|with_extra|with_cv|Download|
|:-------:|:-----:|:-----:|:-----:|
|armv7|ios|OFF|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.ios.armv7.tar.gz)|
|armv7|ios|ON|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.ios.armv7.with_extra.tar.gz)|
|armv8|ios64|OFF|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.ios64.armv8.tar.gz)|
|armv8|ios64|ON|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.ios64.armv8.with_extra.tar.gz)|
|armv7|OFF|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/iOS/inference_lite_lib.ios.armv7.tiny_publish.tar.gz)|
|armv7|OFF|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/iOS/inference_lite_lib.ios.armv7.with_cv.tiny_publish.tar.gz)|
|armv7|ON|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/iOS/inference_lite_lib.ios.armv7.with_cv.with_extra.tiny_publish.tar.gz)|
|armv7|ON|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/iOS/inference_lite_lib.ios.armv7.with_extra.tiny_publish.tar.gz)|
|armv8|OFF|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/iOS/inference_lite_lib.ios.armv8.tiny_publish.tar.gz)|
|armv8|OFF|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/iOS/inference_lite_lib.ios.armv8.with_cv.tiny_publish.tar.gz)|
|armv8|ON|OFF|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/iOS/inference_lite_lib.ios.armv8.with_cv.with_extra.tiny_publish.tar.gz)|
|armv8|ON|ON|[release/v2.6](https://paddlelite-data.bj.bcebos.com/Release/2.6.0/iOS/inference_lite_lib.ios.armv8.with_extra.tiny_publish.tar.gz)|
## opt tool
......@@ -55,7 +54,13 @@
| Linux | [release/v2.3](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) |
| MacOs | [release/v2.3](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
## Installing the Paddle-Lite Python library
- Supported platforms: Windows 10, Ubuntu, Mac
- Python versions: 2.7, 3.5, 3.6, 3.7
```
pip install paddlelite
```
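To verify the installation, a quick import check is enough (a sketch; it only confirms the package loads and exposes the `Opt` class used elsewhere in these docs):
```python
import paddlelite.lite as lite

opt = lite.Opt()  # constructing Opt confirms the native extension loaded
print("paddlelite imported OK")
```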
## Corresponding source build instructions
......
......@@ -10,11 +10,12 @@ PaddleLite 提供了移动端的一键源码编译脚本 `lite/tools/build.sh`
## 1. Environment preparation
Three build environments are currently supported:
Four build environments are currently supported:
1. Docker container environment,
2. Linux (Ubuntu 16.04 recommended),
3. Mac OS.
3. Mac OS,
4. [Windows environment](../demo_guides/x86.html#windows)
### 1. Docker development environment
......
......@@ -224,11 +224,11 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party/mklml"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install/mklml" "${INFER_LITE_PUBLISH_ROOT}/third_party/mklml"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/lite/demo/cxx" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3)
......@@ -327,7 +327,6 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies(publish_inference tiny_publish_cxx_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
endif()
endif()
......
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG))
lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else()
......@@ -282,17 +281,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
--model_dir=${LITE_MODEL_DIR}/resnet50 SERIAL)
add_dependencies(test_resnet50 extern_lite_download_resnet50_tar_gz)
lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv3_fpga SRCS inceptionv3_test_fpga.cc
lite_cc_test(test_resnet50_fpga SRCS resnet50_test_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
......@@ -304,10 +293,6 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc
DEPS ${lite_model_test_DEPS})
# brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
......
......@@ -91,6 +91,8 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
}
std::vector<Place> vaild_places = {
Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kInt32)},
Place{TARGET(kARM), PRECISION(kInt64)},
};
config.set_valid_places(vaild_places);
auto predictor = lite_api::CreatePaddlePredictor(config);
......
......@@ -7,20 +7,8 @@ if(WIN32)
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(lite_pybind ${os_dependency_modules})
elseif(APPLE)
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
set_target_properties(lite_pybind PROPERTIES LINK_FLAGS ${LINK_FLAGS})
add_dependencies(lite_pybind custom_linker_map)
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
set_target_properties(lite_pybind PROPERTIES LINK_FLAGS ${LINK_FLAGS})
add_dependencies(lite_pybind custom_linker_map)
endif(WIN32)
if (LITE_ON_TINY_PUBLISH)
......
......@@ -47,7 +47,7 @@ if '${WITH_MKL}' == 'ON':
PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib']
# link lite.so to paddlelite.libs
COMMAND = "install_name_tool -id \"@loader_path/libs/\" ${PADDLE_BINARY_DIR}\
COMMAND = "install_name_tool -add_rpath \"@loader_path/libs/\" ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
......
......@@ -30,7 +30,8 @@ void ConvElementwiseFuser::BuildPattern() {
auto* bias = VarNode("bias")
->assert_is_op_input("elementwise_add", "Y")
->AsInput()
->assert_is_persistable_var();
->assert_is_persistable_var()
->assert_only_one_output();
// create op nodes
auto* conv2d = OpNode("conv2d", conv_type_)->assert_is_op(conv_type_);
......
......@@ -225,8 +225,8 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
#ifndef LITE_WITH_FPGA
op_desc.SetAttr("enable_int8", true);
#endif
op_desc.SetAttr("weight_scale", weight_scale);
// change the weight from the float type to int8 type.
......
......@@ -105,6 +105,7 @@ class ChannelWiseDequantOpFuser : public FuseBase {
*/
class DeleteQuantDequantOpFuser : public FuseBase {
public:
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
......@@ -118,6 +119,7 @@ class DynamicQuantDequantOpFuser : public FuseBase {
const std::string& op_type,
int i)
: op_type_(op_type), quant_type_(quantized_op_type), times_(i) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
......
......@@ -364,6 +364,11 @@ PMNode *PMNode::assert_is_op() {
return this;
}
PMNode *PMNode::assert_only_one_output() {
asserts_.emplace_back([](const Node *x) { return x->outlinks.size() == 1; });
return this;
}
PMNode *PMNode::assert_is_op(const std::string &op_type) {
asserts_.emplace_back([op_type](const Node *x) {
if (x && x->IsStmt()) {
......
......@@ -127,6 +127,7 @@ struct PMNode {
PMNode* assert_is_var();
PMNode* assert_var_not_persistable();
PMNode* assert_is_persistable_var();
PMNode* assert_only_one_output();
PMNode* assert_is_op_output(const std::string& op_type);
PMNode* assert_is_op_input(const std::string& op_type);
PMNode* assert_is_op_input(const std::string& op_type,
......
......@@ -178,7 +178,6 @@ void SSAGraph::Build(const Program &program,
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
}
if (var_types.count(name)) {
if (!arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
......@@ -192,7 +191,6 @@ void SSAGraph::Build(const Program &program,
"data_type", static_cast<int>(var_types[name]));
}
}
if (is_weights(name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
DirectedLink(arg_node, op_node);
......@@ -202,12 +200,10 @@ void SSAGraph::Build(const Program &program,
auto *arg_node = &node_storage_.back();
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
/*
if (var_types.count(name) && !arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
}
*/
if (is_weights(name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
......
......@@ -134,6 +134,7 @@ class Optimizer {
"mlu_postprocess_pass"}};
if (passes.size() == 1) {
// multi_stream_analysis_pass must be in the front of
// runtime_context_assign_pass
......
......@@ -24,10 +24,16 @@ namespace profile {
namespace {
auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) {
return (c1.target < c2.target) || (c1.op_type < c2.op_type) ||
(c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark);
if (c1.kernel_func_name == "NotImpl" && c2.kernel_func_name == "NotImpl") {
return (c1.target < c2.target) || (c1.op_type < c2.op_type) ||
(c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark);
} else { // compare with ch.kernel_func_name
return (c1.target < c2.target) || (c1.op_type < c2.op_type) ||
(c1.kernel_name < c2.kernel_name) ||
(c1.kernel_func_name < c2.kernel_func_name);
}
};
}
} // namespace
std::map<Type, std::string> TypeStr{
{Type::kUnk, "Unknown"},
......@@ -88,6 +94,36 @@ void Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
#endif
}
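// Count how many profiled units ran this kernel function; units whose kernel
// never set a kernel_func_name ("NotImpl") are matched by op_type instead.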
int Profiler::GetKernelFuncCalledTimes(const std::string& op_type,
const std::string& kernel_func_name) {
int count = 0;
for (size_t i = 0; i < units_.size(); ++i) {
if ((units_[i].character.kernel_func_name == kernel_func_name) &&
(units_[i].character.kernel_func_name != "NotImpl")) {
++count;
} else if ((units_[i].character.kernel_func_name == "NotImpl") &&
(units_[i].character.op_type == op_type)) {
++count;
}
}
return count;
}
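// Accumulate MACs with the same matching rule as above and report the total in GOPs.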
float Profiler::GetKernelFuncSummaryGOPs(const std::string& op_type,
const std::string& kernel_func_name) {
float GOPs = 0;
for (size_t i = 0; i < units_.size(); ++i) {
if ((units_[i].character.kernel_func_name == kernel_func_name) &&
(units_[i].character.kernel_func_name != "NotImpl")) {
GOPs += units_[i].character.macs;
} else if ((units_[i].character.kernel_func_name == "NotImpl") &&
(units_[i].character.op_type == op_type)) {
GOPs += units_[i].character.macs;
}
}
return GOPs * 1e-9f;
}
std::string Profiler::Summary(Type type, bool concise, size_t w) {
using std::setw;
using std::left;
......@@ -108,13 +144,11 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
<< " warm-ups =====" << std::endl;
}
ss << setw(20) << left << "OperatorType"
<< " " << setw(30) << left << "KerneAttr";
if (!concise) {
ss << " " << setw(24) << left << "KernelName";
}
ss << " " << setw(16) << left << "Remark";
<< " " << setw(30) << left << "KerneAttr(Place)"
<< " " << setw(24) << left << "KernelFuncName";
if (!concise) {
ss << " " << setw(15) << left << "InDim"
ss << " " << setw(26) << left << "Remark"
<< " " << setw(15) << left << "InDim"
<< " " << setw(15) << left << "FilterDim"
<< " " << setw(15) << left << "OutDim";
}
......@@ -124,10 +158,13 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
if (!concise) {
ss << " " << setw(7) << left << "Last(ms)";
}
ss << " " << setw(7) << left << "Avg(%)";
ss << " " << setw(7) << left << "Avg(%)"
<< " " << setw(7) << left << "GOPs";
if (!concise) {
ss << " " << setw(7) << left << "GOPs"
<< " " << setw(7) << left << "GOPS";
ss << " " << setw(7) << left << "GOPS";
}
if (concise) {
ss << " " << setw(11) << left << "CalledTimes";
}
#ifdef LITE_WITH_OPENCL
ss << " " << setw(9) << left << "clAvg(ms)"
......@@ -185,14 +222,20 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
// clang-format off
ss << setw(20) << left << fixed << item.first.op_type
<< " " << setw(30) << left << fixed << item.first.kernel_attr
<< " " << setw(16) << left << fixed << item.first.remark
<< " " << setw(24) << left << fixed << item.first.kernel_func_name
<< " " << setw(7) << left << fixed << setprecision(3)
<< item.second.avg
<< " " << setw(7) << left << fixed << setprecision(3)
<< item.second.min
<< " " << setw(7) << left << fixed << setprecision(3)
<< item.second.max
<< " " << setprecision(2) << percent << "% ";
<< item.second.max
<< " " << setprecision(2) << percent << "% "
<< " " << setw(7) << left << fixed << setprecision(3)
<< GetKernelFuncSummaryGOPs(item.first.op_type,
item.first.kernel_func_name)
<< " " << setw(11) << left << fixed
<< GetKernelFuncCalledTimes(item.first.op_type,
item.first.kernel_func_name);
#ifdef LITE_WITH_OPENCL
float cl_percent = 0;
if (cl_total > 0) {
......@@ -204,7 +247,7 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
<< item.second.cl_min
<< " " << setw(9) << left << fixed << setprecision(3)
<< item.second.cl_max
<< " " << left << fixed <<setprecision(2) << cl_percent << "% ";
<< " " << left << fixed << setprecision(2) << cl_percent << "% ";
#endif
ss << std::endl;
// clang-format on
......@@ -244,7 +287,7 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
<< " " << setw(30) << left << fixed << unit.Character().kernel_attr
<< " " << setw(24) << left << fixed
<< unit.Character().kernel_func_name
<< " " << setw(16) << left << fixed << unit.Character().remark
<< " " << setw(26) << left << fixed << unit.Character().remark
<< " " << setw(15) << left << fixed << unit.Character().input_shape
<< " " << setw(15) << left << fixed << unit.Character().filter_shape
<< " " << setw(15) << left << fixed << unit.Character().output_shape
......@@ -253,7 +296,7 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
<< " " << setw(7) << left << fixed << setprecision(3) << times.Max(w)
<< " " << setw(7) << left << fixed << setprecision(3) << times.Last(w)
<< " " << left << setprecision(2) << percent << "% "
<< " " << setw(7) << left << fixed << setprecision(2)
<< " " << setw(7) << left << fixed << setprecision(3)
<< 1e-9f * unit.Character().macs
<< " " << setw(7) << left << fixed << setprecision(2)
<< 1e-6f * unit.Character().macs / times.Avg(w);
......
......@@ -101,6 +101,10 @@ class Profiler final {
void StartTiming(Type type, const int index, KernelContext* ctx);
void StopTiming(Type type, const int index, KernelContext* ctx);
std::string Summary(Type type, bool concise = true, size_t warm_up = 10);
int GetKernelFuncCalledTimes(const std::string& op_type,
const std::string& kernel_func_name);
float GetKernelFuncSummaryGOPs(const std::string& op_type,
const std::string& kernel_func_name);
OpCharacter* GetOpCharacter(const size_t index);
private:
......
......@@ -73,7 +73,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
std::unordered_map<std::string, cpp::VarDesc> origin_var_maps;
auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0);
auto var_size = main_block.VarsSize();
for (size_t i = 0; i < var_size; i++) {
for (int i = 0; i < var_size; i++) {
auto v = main_block.GetVar<cpp::VarDesc>(i);
auto name = v->Name();
origin_var_maps.emplace(name, *v);
......@@ -86,16 +86,12 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
auto* scope = op->scope();
auto in_names = op->op_info()->input_names();
auto out_names = op->op_info()->output_names();
std::vector<std::string> var_names;
var_names.insert(var_names.end(), in_names.begin(), in_names.end());
var_names.insert(var_names.end(), out_names.begin(), out_names.end());
std::sort(var_names.begin(), var_names.end());
var_names.erase(std::unique(var_names.begin(), var_names.end()),
var_names.end());
for (auto& var_name : var_names) {
auto it = origin_var_maps.find(var_name);
in_names.insert(in_names.end(), out_names.begin(), out_names.end());
std::sort(in_names.begin(), in_names.end());
in_names.erase(std::unique(in_names.begin(), in_names.end()),
in_names.end());
for (auto& in_name : in_names) {
auto it = origin_var_maps.find(in_name);
if (it != origin_var_maps.end()) {
auto* v = main_block.AddVar<cpp::VarDesc>();
v->SetName((it->second).Name());
......@@ -108,30 +104,37 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
} else {
// New created vars must be LOD_TENSOR
auto* v = main_block.AddVar<cpp::VarDesc>();
v->SetName(var_name);
v->SetName(in_name);
v->SetType(cpp::VarDesc::Type::LOD_TENSOR);
std::string in_arg_name;
op->op_info()->GetInputArgname(var_name, &in_arg_name);
auto type = kernel->GetInputDeclType(in_arg_name);
const Type* type;
if (op->op_info()->GetInputArgname(in_name, &in_arg_name)) {
type = kernel->GetInputDeclType(in_arg_name);
} else {
op->op_info()->GetOutputArgname(in_name, &in_arg_name);
type = kernel->GetOutputDeclType(in_arg_name);
}
if (type->IsTensor()) {
auto tensor = scope->FindVar(var_name)->GetMutable<Tensor>();
auto tensor = scope->FindVar(in_name)->GetMutable<Tensor>();
v->SetPersistable(tensor->persistable());
if ((it->second).Name() != "feed" && (it->second).Name() != "fetch") {
if (in_name != "feed" && in_name != "fetch") {
v->SetShape(tensor->dims().data());
switch (tensor->precision()) {
#define SET_DATATYPE(precision__, data_type) \
case PrecisionType::precision__: \
v->SetDataType(data_type); \
#define SET_DATATYPE(precision__, data_type) \
case PrecisionType::precision__: \
v->SetDataType(data_type); \
LOG(INFO) << "update var" << (it->second).Name() << "done"; \
break
SET_DATATYPE(kBool, VarDescAPI::VarDataType::BOOL);
SET_DATATYPE(kFloat, VarDescAPI::VarDataType::FP32);
SET_DATATYPE(kFP16, VarDescAPI::VarDataType::FP16);
SET_DATATYPE(kInt8, VarDescAPI::VarDataType::INT8);
SET_DATATYPE(kInt16, VarDescAPI::VarDataType::INT16);
SET_DATATYPE(kInt32, VarDescAPI::VarDataType::INT32);
SET_DATATYPE(kInt64, VarDescAPI::VarDataType::INT64);
#undef SET_DATATYPE
default:
LOG(FATAL) << "unknown precision type";
VLOG(4) << "warning! unknown precision type";
}
}
} else {
......@@ -141,7 +144,6 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
}
}
}
void RuntimeProgram::Run() {
#ifdef LITE_WITH_PRECISION_PROFILE
auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
......@@ -153,12 +155,6 @@ void RuntimeProgram::Run() {
#ifndef LITE_WITH_FPGA
if (inst.is_feed_fetch_op()) continue;
#endif
std::string op_type = inst.op()->op_info()->Type();
VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr()
<< " on Target " << TargetToStr(inst.kernel()->target());
#ifdef LITE_WITH_CUDA
if (inst.need_sync()) {
inst.Sync();
......
......@@ -6,16 +6,44 @@ set(TARGET mobilenet_full_api)
set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
if (WIN32)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
if (MSVC_STATIC_CRT)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif()
endif()
# 2. link mklml and Paddle-Lite directory
link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
# 3. compile options
add_definitions(-std=c++11 -g -O3 -pthread)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
if (NOT WIN32)
add_definitions(-std=c++11 -g -O3 -pthread)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
endif()
# 4.add executable output
add_executable(${TARGET} ${TARGET}.cc)
target_link_libraries(${TARGET} -lpaddle_full_api_shared)
target_link_libraries(${TARGET} -liomp5)
target_link_libraries(${TARGET} -ldl)
if (WIN32)
set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib)
target_link_libraries(${TARGET} shlwapi.lib)
target_link_libraries(${TARGET} ${MATH_LIB})
add_custom_command(TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release
COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release
)
else()
target_link_libraries(${TARGET} -lpaddle_full_api_shared)
target_link_libraries(${TARGET} -liomp5)
target_link_libraries(${TARGET} -ldl)
endif()
@echo off
setlocal
setlocal enabledelayedexpansion
set source_path=%~dp0
set build_directory=%source_path%\build
if EXIST "%build_directory%" (
call:rm_rebuild_dir "%build_directory%"
)
md "%build_directory%"
set vcvarsall_dir=C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat
IF NOT EXIST "%vcvarsall_dir%" (
goto set_vcvarsall_dir
) else (
goto cmake
)
:set_vcvarsall_dir
SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>"
set tmp_var=!vcvarsall_dir!
call:remove_space
set vcvarsall_dir=!tmp_var!
IF NOT EXIST "!vcvarsall_dir!" (
echo "------------!vcvarsall_dir! not exist------------"
goto set_vcvarsall_dir
)
:cmake
D:
cd "%build_directory%"
cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64
call "%vcvarsall_dir%" amd64
msbuild /maxcpucount:8 /p:Configuration=Release mobilenet_full_api.vcxproj
goto:eof
:rm_rebuild_dir
del /f /s /q "%~1\*.*" >nul 2>&1
rd /s /q "%~1" >nul 2>&1
goto:eof
:remove_space
:remove_left_space
if "%tmp_var:~0,1%"==" " (
set "tmp_var=%tmp_var:~1%"
goto remove_left_space
)
:remove_right_space
if "%tmp_var:~-1%"==" " (
set "tmp_var=%tmp_var:~0,-1%"
goto remove_right_space
)
goto:eof
......@@ -16,6 +16,11 @@
#include <vector>
#include "paddle_api.h" // NOLINT
#ifdef _WIN32
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#endif
using namespace paddle::lite_api; // NOLINT
int64_t ShapeProduction(const shape_t& shape) {
......
......@@ -6,16 +6,44 @@ set(TARGET mobilenet_light_api)
set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
if (WIN32)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
if (MSVC_STATIC_CRT)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif()
endif()
# 2. link mklml and Paddle-Lite directory
link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
# 3. compile options
add_definitions(-std=c++11 -g -O3 -pthread)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
if (NOT WIN32)
add_definitions(-std=c++11 -g -O3 -pthread)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
endif()
# 4.add executable output
add_executable(${TARGET} ${TARGET}.cc)
target_link_libraries(${TARGET} -lpaddle_light_api_shared)
target_link_libraries(${TARGET} -liomp5)
target_link_libraries(${TARGET} -ldl)
if (WIN32)
set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib)
target_link_libraries(${TARGET} shlwapi.lib)
target_link_libraries(${TARGET} ${MATH_LIB})
add_custom_command(TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release
COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release
)
else()
target_link_libraries(${TARGET} -lpaddle_light_api_shared)
target_link_libraries(${TARGET} -liomp5)
target_link_libraries(${TARGET} -ldl)
endif()
@echo off
setlocal
setlocal enabledelayedexpansion
set source_path=%~dp0
set build_directory=%source_path%\build
if EXIST "%build_directory%" (
call:rm_rebuild_dir "%build_directory%"
)
md "%build_directory%"
set vcvarsall_dir=C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat
IF NOT EXIST "%vcvarsall_dir%" (
goto set_vcvarsall_dir
) else (
goto cmake
)
:set_vcvarsall_dir
SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>"
set tmp_var=!vcvarsall_dir!
call:remove_space
set vcvarsall_dir=!tmp_var!
IF NOT EXIST "!vcvarsall_dir!" (
echo "------------!vcvarsall_dir! not exist------------"
goto set_vcvarsall_dir
)
:cmake
D:
cd "%build_directory%"
cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64
call "%vcvarsall_dir%" amd64
msbuild /maxcpucount:8 /p:Configuration=Release mobilenet_light_api.vcxproj
goto:eof
:rm_rebuild_dir
del /f /s /q "%~1\*.*" >nul 2>&1
rd /s /q "%~1" >nul 2>&1
goto:eof
:remove_space
:remove_left_space
if "%tmp_var:~0,1%"==" " (
set "tmp_var=%tmp_var:~1%"
goto remove_left_space
)
:remove_right_space
if "%tmp_var:~-1%"==" " (
set "tmp_var=%tmp_var:~0,-1%"
goto remove_right_space
)
goto:eof
......@@ -15,6 +15,9 @@
#pragma once
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/kernel.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
namespace paddle {
namespace lite {
......@@ -36,6 +39,13 @@ class ConvCompute : public KernelLite<TARGET(kARM), Ptype> {
impl_->Run();
}
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
impl_->SetProfileRuntimeKernelInfo(ch);
}
#endif
~ConvCompute() {
if (impl_ != nullptr) {
delete impl_;
......
......@@ -50,6 +50,9 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
flag_trans_weights_ = true;
}
impl_ = lite::arm::math::conv_depthwise_3x3_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_3x3_fp32";
#endif
} else if (kw == 5) {
// VLOG(5) << "invoke 5x5 dw conv fp32";
auto strides = param.strides;
......@@ -67,6 +70,9 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
w_data_in, w_data, oc, 1, cblock, kh * kw);
flag_trans_weights_ = true;
impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_5x5_fp32";
#endif
} else {
LOG(FATAL)
<< "5x5 depthwise conv only support stride == 1 or stride == 2";
......@@ -103,6 +109,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
// trans weights
// VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_3x3_int8_fp32";
#endif
int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>();
......@@ -113,6 +122,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
// trans weights
// VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_5x5_int8_fp32";
#endif
int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>();
......@@ -162,6 +174,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
// trans weights
// VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_3x3_int8_int8";
#endif
int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>();
......@@ -172,6 +187,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
// trans weights
// VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_5x5_int8_int8";
#endif
int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>();
......@@ -183,6 +201,14 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
}
}
#ifdef LITE_WITH_PROFILE
template <>
void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
......@@ -225,6 +251,14 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
w_scale_.data());
}
#ifdef LITE_WITH_PROFILE
template <>
void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
......@@ -267,6 +301,14 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
w_scale_.data());
}
#ifdef LITE_WITH_PROFILE
template <>
void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>();
......
......@@ -15,6 +15,7 @@
#pragma once
#include <cmath>
#include <string>
#include <vector>
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
......@@ -48,6 +49,15 @@ class DepthwiseConv : public KernelLite<TARGET(kARM), Ptype> {
virtual void PrepareForRun();
virtual void Run();
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvDw"};
#endif
private:
using param_t = operators::ConvParam;
Tensor weights_;
......
......@@ -19,6 +19,14 @@ namespace lite {
namespace kernels {
namespace arm {
#ifdef LITE_WITH_PROFILE
template <>
void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
......@@ -62,6 +70,9 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data,
param,
&ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s1_direct_fp32";
#endif
} else {
lite::arm::math::conv_3x3s2_direct_fp32(i_data,
o_data,
......@@ -76,9 +87,20 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data,
param,
&ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s2_direct_fp32";
#endif
}
}
#ifdef LITE_WITH_PROFILE
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
......@@ -117,6 +139,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
param,
&ctx,
w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s1_direct_int8";
#endif
} else {
lite::arm::math::conv_3x3s2_direct_int8(i_data,
o_data,
......@@ -132,9 +157,20 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
param,
&ctx,
w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s2_direct_int8";
#endif
}
}
#ifdef LITE_WITH_PROFILE
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>();
......@@ -173,6 +209,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
param,
&ctx,
w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s1_direct_int8";
#endif
} else {
lite::arm::math::conv_3x3s2_direct_int8(i_data,
o_data,
......@@ -188,6 +227,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
param,
&ctx,
w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s2_direct_int8";
#endif
}
}
......
......@@ -15,6 +15,7 @@
#pragma once
#include <cmath>
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/context.h"
......@@ -180,6 +181,15 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> {
virtual void Run();
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvDirect"};
#endif
/// todo, support inplace weights transform
protected:
Tensor weights_;
......
......@@ -81,6 +81,14 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
}
}
#ifdef LITE_WITH_PROFILE
template <>
void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
......@@ -111,12 +119,26 @@ void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
if (flag_1x1gemm_) {
lite::arm::math::conv1x1s1_gemm(
din, dout, bs, oc, oh, ow, ic, ih, iw, weights, bias, param, &ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv1x1s1_gemm";
#endif
} else {
lite::arm::math::conv_im2col_gemm(
din, dout, bs, oc, oh, ow, ic, ih, iw, weights, bias, param, &ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_im2col_gemm";
#endif
}
}
#ifdef LITE_WITH_PROFILE
template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
......@@ -159,6 +181,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
param,
&ctx,
w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv1x1s1_gemm_int8";
#endif
} else {
lite::arm::math::conv_im2col_gemm_int8(din,
dout,
......@@ -174,9 +199,20 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
param,
&ctx,
w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_im2col_gemm_int8";
#endif
}
}
#ifdef LITE_WITH_PROFILE
template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>();
......@@ -219,6 +255,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
param,
&ctx,
w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv1x1s1_gemm_int8";
#endif
} else {
lite::arm::math::conv_im2col_gemm_int8(din,
dout,
......@@ -234,6 +273,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
param,
&ctx,
w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_im2col_gemm_int8";
#endif
}
}
......
......@@ -15,6 +15,7 @@
#pragma once
#include <cmath>
#include <string>
#include <vector>
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/backends/arm/math/funcs.h"
......@@ -94,6 +95,15 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
virtual void PrepareForRun();
virtual void Run();
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvGemm"};
#endif
/// TODO: support in-place weights transform
protected:
using param_t = operators::ConvParam;
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <string>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/kernel.h"
#include "lite/operators/conv_transpose_op.h"
......@@ -33,6 +34,14 @@ class Conv2DTransposeCompute
~Conv2DTransposeCompute() = default;
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvTranspose"};
#endif
protected:
int workspace_size_{0};
};
......
......@@ -94,6 +94,14 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
ReInitWhenNeeded();
}
#ifdef LITE_WITH_PROFILE
template <>
void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <>
void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
......@@ -130,6 +138,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data,
param,
&ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_compute_6x6_3x3";
#endif
} else {
int tile_block = 8;
int block_count =
......@@ -148,6 +159,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data,
param,
&ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_compute_2x2_3x3";
#endif
} else {
lite::arm::math::conv_compute_2x2_3x3_small(i_data,
o_data,
......@@ -162,6 +176,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data,
param,
&ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_compute_2x2_3x3_small";
#endif
}
}
}
......
......@@ -15,6 +15,7 @@
#pragma once
#include <cmath>
#include <string>
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
#include "lite/core/kernel.h"
......@@ -34,6 +35,13 @@ class WinogradConv : public KernelLite<TARGET(kARM), Ptype> {
virtual void PrepareForRun();
virtual void ReInitWhenNeeded();
virtual void Run();
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvWino"};
#endif
protected:
using param_t = operators::ConvParam;
......
......@@ -66,5 +66,5 @@ REGISTER_LITE_KERNEL(fill_constant,
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("ShapeTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
# if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_FPGA))
# return()
# endif()
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_FPGA))
return()
endif()
set(fpga_deps fpga_target_wrapper kernel_fpga)
......
......@@ -5,11 +5,6 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(unsqueeze_compute_host Host basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps})
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(one_hot_compute_host Host extra SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
add_kernel(expand_compute_host Host basic SRCS expand_compute.cc DEPS ${lite_kernel_deps})
add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps})
......@@ -20,3 +15,4 @@ add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${li
add_kernel(write_to_array_compute_host Host extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps})
add_kernel(read_from_array_compute_host Host extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps})
add_kernel(assign_compute_host Host extra SRCS assign_compute.cc DEPS ${lite_kernel_deps})
add_kernel(one_hot_compute_host Host extra SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
......@@ -13,10 +13,6 @@
// limitations under the License.
#include "lite/kernels/host/assign_compute.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
......
......@@ -34,8 +34,6 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto x_rank = x_dims.size();
CHECK_GE(x_rank, 2);
auto out_name = op_info->Output("Out").front();
......@@ -45,9 +43,6 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (dropout_implementation == "upscale_in_train") {
scale = 1.f;
}
// HiAI only support [n, c, 1, 1] for the shape of scale
std::vector<int64_t> scale_shape = {
1, x_rank < 3 ? 1 : x_dims[x_rank - 3], 1, 1};
// X node
std::shared_ptr<Node> x_node = nullptr;
......@@ -61,11 +56,7 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scale_node = graph->Add<ge::op::Scale>(out_name);
auto scale_op = scale_node->data<ge::op::Scale>();
scale_op->set_input_x(*x_node->data());
scale_op->set_attr_axis(1);
// Add filter node(fill with scale)
auto filter_node = graph->Add(out_name + "/filter", scale, scale_shape);
scale_op->set_input_filter(*filter_node->data());
scale_op->set_attr_filler_value(scale);
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
......@@ -120,7 +120,9 @@ int SubgraphEngine::BuildDeviceProgram() {
return subgraph::FAILED;
}
auto device_program = std::make_shared<device_program_t>(device_client);
device_program_map_[inputs_shape_] = device_program;
if (!inputs_shape_.empty()) {
device_program_map_[inputs_shape_] = device_program;
}
// Query and check the dimensions of valid input and output tensors
std::vector<hiai::TensorDimension> device_idims, device_odims;
......
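The added `inputs_shape_.empty()` check keeps the shape-keyed program cache from being populated under an empty key. A hedged standalone illustration of the same guard with a plain `std::map` — `DeviceProgram` is a stand-in for the real `device_program_t`:

```cpp
// Illustration of the shape-key guard added above.
#include <cstdint>
#include <map>
#include <memory>
#include <vector>

struct DeviceProgram {};  // stand-in for device_program_t

int main() {
  using Shapes = std::vector<std::vector<int64_t>>;
  std::map<Shapes, std::shared_ptr<DeviceProgram>> device_program_map;

  Shapes inputs_shape;  // stays empty if input dims were never resolved
  auto device_program = std::make_shared<DeviceProgram>();

  // Mirrors the added check: only cache a program under a concrete key,
  // so an empty shape never aliases a real entry in the map.
  if (!inputs_shape.empty()) {
    device_program_map[inputs_shape] = device_program;
  }
  return 0;
}
```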
......@@ -324,7 +324,7 @@ void SaveCombinedParamsPb(const std::string &path,
std::sort(paramlist.begin(), paramlist.end());
// Load vars
std::ofstream file(path);
std::ofstream file(path, std::ios::binary);
CHECK(file.is_open());
for (size_t i = 0; i < paramlist.size(); ++i) {
SerializeTensor(file, exec_scope, paramlist[i]);
......
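The `std::ios::binary` flag matters because in text mode on Windows every `0x0A` byte written is expanded to `0x0D 0x0A`, silently corrupting serialized tensor bytes; on POSIX both modes behave identically. A small demonstration (file names are illustrative):

```cpp
// Text mode vs binary mode when writing raw bytes.
#include <fstream>

int main() {
  const char bytes[] = {0x0A, 0x42};  // payload containing a '\n' byte
  // Text mode: on Windows the 0x0A is written as 0x0D 0x0A (3 bytes total).
  std::ofstream text_file("params_text.bin");
  text_file.write(bytes, sizeof(bytes));
  text_file.close();
  // Binary mode: the file is always exactly 2 bytes, on every platform.
  std::ofstream bin_file("params_bin.bin", std::ios::binary);
  bin_file.write(bytes, sizeof(bytes));
  bin_file.close();
  return 0;
}
```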
......@@ -158,8 +158,6 @@ add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS
add_operator(__xpu__embedding_with_eltwise_add_op extra SRCS __xpu__embedding_with_eltwise_add_op.cc DEPS ${op_DEPS})
add_operator(__xpu__fc_op extra SRCS __xpu__fc_op.cc DEPS ${op_DEPS})
add_operator(one_hot basic SRCS one_hot_op.cc DEPS ${op_DEPS})
if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc
DEPS fc_op memory
......
......@@ -15,6 +15,9 @@
#pragma once
#include <string>
#include "lite/core/op_lite.h"
#ifdef LITE_WITH_PROFILE
#include "lite/api/paddle_place.h"
#endif
namespace paddle {
namespace lite {
......@@ -34,6 +37,58 @@ class ActivationOp : public OpLite {
std::string DebugString() const override { return "activation_op"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter* ch) {
auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = ActivationTypeToStr(param_.active_type);
switch (param_.active_type) {
case lite_api::ActivationType::kRelu:
ch->macs = param_.X->numel();
break;
case lite_api::ActivationType::kRelu6:
ch->macs = param_.X->numel() * 2.0;
break;
case lite_api::ActivationType::kLeakyRelu:
ch->macs = param_.X->numel() * 2.0;
break;
case lite_api::ActivationType::kPRelu:
ch->macs = param_.X->numel() * 2.0;
break;
case lite_api::ActivationType::kSwish:
ch->macs = param_.X->numel() * 4.0;
break;
case lite_api::ActivationType::kSigmoid:
ch->macs = param_.X->numel() * 3.0;
break;
case lite_api::ActivationType::kTanh:
ch->macs = param_.X->numel() * 5.0;
break;
case lite_api::ActivationType::kExp:
ch->macs = param_.X->numel();
break;
case lite_api::ActivationType::kAbs:
ch->macs = param_.X->numel();
break;
case lite_api::ActivationType::kHardSwish:
ch->macs = param_.X->numel() * 5.0;
break;
case lite_api::ActivationType::kReciprocal:
ch->macs = param_.X->numel();
break;
case lite_api::ActivationType::kIndentity:
break;
default:
LOG(FATAL) << "This Type of Activation:"
<< static_cast<int>(param_.active_type)
<< ActivationTypeToStr(param_.active_type)
<< " doesn't support";
}
}
#endif
private:
mutable operators::ActivationParam param_;
};
......
......@@ -39,6 +39,17 @@ class AffineChannelOpLite : public OpLite {
std::string DebugString() const override { return "affine_channel"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = param_.data_layout;
ch->macs = param_.X->numel() * 2.0;
}
#endif
private:
mutable AffineChannelParam param_;
};
......
......@@ -39,6 +39,27 @@ class ArgmaxOpLite : public OpLite {
std::string DebugString() const override { return "argmax"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "axis" + std::to_string(param_.Axis);
auto axis = param_.Axis;
if (axis < 0) {
axis += input_dims.size();
}
int max_num = 1;
for (int64_t i = axis + 1; i < input_dims.size(); i++)
max_num *= input_dims[i];
float gops = 1.0f;
for (int i = 1; i <= max_num; i++) gops *= i;
ch->macs = gops * output_dims.production();
}
#endif
private:
mutable ArgmaxParam param_;
};
......
......@@ -37,6 +37,17 @@ class AssignOpLite : public OpLite {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "assign"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
// ch->remark = "";
ch->macs = param_.X->numel() * 1.0;
}
#endif
private:
mutable AssignParam param_;
};
......
......@@ -39,6 +39,17 @@ class AssignValueOpLite : public OpLite {
std::string DebugString() const override { return "assign value"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
// auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
// ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "dtype" + std::to_string(param_.dtype);
ch->macs = param_.Out->numel() * 1.0;
}
#endif
private:
mutable AssignValueParam param_;
};
......
......@@ -39,6 +39,17 @@ class AxpyOpLite : public OpLite {
std::string DebugString() const override { return "axpy"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
// ch->remark = "";
ch->macs = param_.X->numel() * 2.0;
}
#endif
private:
mutable AxpyParam param_;
};
......
......@@ -37,6 +37,17 @@ class BatchNormOp : public OpLite {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "batch_norm"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.x->dims();
auto output_dims = param_.y->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
// ch->remark = "";
ch->macs = param_.y->numel() * 2.0;
}
#endif
private:
mutable BatchNormParam param_;
};
......
......@@ -39,6 +39,17 @@ class BoxClipOpLite : public OpLite {
std::string DebugString() const override { return "box clip"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.Input->dims();
auto output_dims = param_.Output->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
// ch->remark = "";
ch->macs = param_.Output->numel() * 2.0;
}
#endif
private:
mutable BoxClipParam param_;
};
......
......@@ -34,8 +34,21 @@ class BoxCoderOpLite : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "box_coder"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
// auto input_dims = param_.Input->dims();
// auto output_dims = param_.Output->dims();
// ch->input_shape = ch->DimToStr(input_dims);
// ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "proposals" + std::to_string(param_.proposals->dims()[0]) +
"x" + std::to_string(param_.proposals->dims()[1]);
ch->macs = param_.proposals->dims()[0] * param_.proposals->dims()[1] * 30.f;
}
#endif
private:
mutable BoxCoderParam param_;
};
......
......@@ -50,6 +50,17 @@ class CalibOpLite : public OpLite {
std::string DebugString() const override { return "calib"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.input->dims();
auto output_dims = param_.output->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "scale" + std::to_string(param_.scale);
ch->macs = param_.output->numel() * 1.0f;
}
#endif
private:
mutable CalibParam param_;
};
......
......@@ -38,6 +38,18 @@ class CompareOp : public OpLite {
std::string DebugString() const override { return "binary logical"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto output_dims = param_.Out->dims();
ch->input_shape = "X:" + ch->DimToStr(param_.X->dims()) + "Y:" +
ch->DimToStr(param_.Y->dims());
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "axis" + std::to_string(param_.axis) + "force_cpu" +
std::to_string(param_.force_cpu);
ch->macs = param_.Out->numel() * 1.0f;
}
#endif
private:
mutable CompareParam param_;
};
......
......@@ -37,6 +37,21 @@ class ConcatOpLite : public OpLite {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "concat"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto output_dims = param_.output->dims();
std::string inputs_shape = "";
for (size_t i = 0; i < param_.x.size(); ++i) {
inputs_shape += ch->DimToStr(param_.x[i]->dims());
if (i != param_.x.size() - 1) inputs_shape += "/";
}
ch->input_shape = inputs_shape;
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "axis" + std::to_string(param_.axis);
ch->macs = 0.f; // no computation, only memory movement
}
#endif
private:
mutable ConcatParam param_;
};
......
......@@ -22,6 +22,9 @@
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
#ifdef LITE_WITH_PROFILE
#include "lite/api/paddle_place.h"
#endif
namespace paddle {
namespace lite {
......@@ -44,12 +47,13 @@ class ConvOpLite : public OpLite {
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->filter_shape = ch->DimToStr(filter_dims);
ch->remark = std::to_string(filter_dims[2]) + "x" +
std::to_string(filter_dims[3]) + "p" +
std::to_string((*param_.paddings)[0]) + "s" +
std::to_string(param_.strides[0]) + "g" +
std::to_string(param_.groups) + "d" +
std::to_string((*param_.dilations)[0]);
ch->remark =
std::to_string(filter_dims[2]) + "x" + std::to_string(filter_dims[3]) +
"p" + std::to_string((*param_.paddings)[0]) + "s" +
std::to_string(param_.strides[0]) + "g" +
std::to_string(param_.groups) + "d" +
std::to_string((*param_.dilations)[0]) + (param_.bias ? "Bias" : "") +
ActivationTypeToStr(param_.activation_param.active_type);
// MACs = 2.f * kw * kh * batchsize * out_c * out_h * out_w * in_c / group
// GMACs = 1e-9f * MACs
// GMACPS = 1e-6f * MACs / predict_ms
......
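A worked instance of the MACs comment above, with illustrative (not benchmarked) shapes: a 3x3, group=1 convolution on a 1x64x56x56 input producing a 1x128x56x56 output:

```cpp
// Worked example of: MACs = 2.f * kw * kh * batchsize * out_c * out_h *
// out_w * in_c / group, plus the derived GMACs and GMACPS figures.
#include <cstdio>

int main() {
  float kw = 3.f, kh = 3.f;
  float batch = 1.f, out_c = 128.f, out_h = 56.f, out_w = 56.f;
  float in_c = 64.f, group = 1.f;
  float macs = 2.f * kw * kh * batch * out_c * out_h * out_w * in_c / group;
  printf("MACs   = %.0f\n", macs);                 // 462422016
  printf("GMACs  = %.3f\n", 1e-9f * macs);         // ~0.462
  // GMACPS = 1e-6f * MACs / predict_ms; e.g. at predict_ms = 10:
  printf("GMACPS = %.2f\n", 1e-6f * macs / 10.f);  // ~46.24
  return 0;
}
```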
......@@ -21,6 +21,9 @@
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
#ifdef LITE_WITH_PROFILE
#include "lite/api/paddle_place.h"
#endif
namespace paddle {
namespace lite {
......@@ -42,6 +45,29 @@ class ConvTransposeOpLite : public OpLite {
std::string DebugString() const override { return "conv_transpose"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto filter_dims = param_.filter->dims();
auto input_dims = param_.x->dims();
auto output_dims = param_.output->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->filter_shape = ch->DimToStr(filter_dims);
ch->remark =
std::to_string(filter_dims[2]) + "x" + std::to_string(filter_dims[3]) +
"p" + std::to_string((*param_.paddings)[0]) + "s" +
std::to_string(param_.strides[0]) + "g" +
std::to_string(param_.groups) + "d" +
std::to_string((*param_.dilations)[0]) + (param_.bias ? "Bias" : "") +
ActivationTypeToStr(param_.activation_param.active_type);
// MACs = 2.f * kw * kh * batchsize * out_c * out_h * out_w * in_c / group
// GMACs = 1e-9f * MACs
// GMACPS = 1e-6f * MACs / predict_ms
ch->macs = 2.f * filter_dims[2] * filter_dims[3] *
output_dims.production() * input_dims[1] / param_.groups;
}
#endif
private:
mutable ConvParam param_;
std::string padding_algorithm_{""};
......
......@@ -35,6 +35,17 @@ class ElementwiseOp : public OpLite {
std::string DebugString() const override { return "elementwise_op"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter* ch) {
auto output_dims = param_.Out->dims();
ch->input_shape = "X" + ch->DimToStr(param_.X->dims()) + "Y" +
ch->DimToStr(param_.Y->dims());
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "axis" + std::to_string(param_.axis);
ch->macs = 1.0f * param_.Out->numel();
}
#endif
private:
mutable operators::ElementwiseParam param_;
};
......
......@@ -43,6 +43,17 @@ class FcOpLite : public OpLite {
std::string DebugString() const override { return "fc"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto m = param_.input->dims().count(0, param_.in_num_col_dims);
ch->input_shape = ch->DimToStr(param_.input->dims());
ch->filter_shape = ch->DimToStr(param_.w->dims());
ch->output_shape = ch->DimToStr(param_.output->dims());
ch->remark = (param_.bias ? "Bias" : "") + param_.activation_type;
ch->macs = m * param_.w->dims()[0] * param_.w->dims()[1] * 3.0f;
}
#endif
private:
mutable FcParam param_;
};
......
......@@ -38,6 +38,15 @@ class IncrementOp : public OpLite {
std::string DebugString() const override { return "increment"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
ch->remark = "step" + std::to_string(param_.step);
ch->macs = param_.X->numel() * 1.0f;
}
#endif
private:
mutable IncrementParam param_;
};
......
......@@ -36,8 +36,22 @@ class InstanceNormOp : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "instance_norm"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.x->dims());
ch->output_shape = ch->DimToStr(param_.out->dims());
// ch->remark = "";
auto x_dims = param_.x->dims();
auto nc = x_dims[0] * x_dims[1];
auto hw = x_dims[2] * x_dims[3];
auto nchw = x_dims.production();
ch->macs = 5.f * nchw + 3.f * (nc + hw);
}
#endif
private:
mutable InstanceNormParam param_;
};
......
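A quick numeric check of the instance_norm cost model above (macs = 5*nchw + 3*(nc + hw)) for an illustrative 1x32x28x28 input:

```cpp
// Numeric check of the instance_norm MACs estimate.
#include <cstdio>

int main() {
  long n = 1, c = 32, h = 28, w = 28;
  long nc = n * c, hw = h * w, nchw = n * c * h * w;
  float macs = 5.f * nchw + 3.f * (nc + hw);
  printf("macs = %.0f\n", macs);  // 5*25088 + 3*(32 + 784) = 127888
  return 0;
}
```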
......@@ -36,8 +36,18 @@ class InterpolateOp : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "interpolate"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
ch->remark = param_.interp_method;
ch->macs = param_.Out->numel() * 14.f;
}
#endif
private:
mutable InterpolateParam param_;
};
......
......@@ -38,6 +38,15 @@ class LayerNormOp : public OpLite {
std::string DebugString() const override { return "layer_norm"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->output_shape = ch->DimToStr(param_.Y->dims());
ch->remark = "begin_norm_axis" + std::to_string(param_.begin_norm_axis);
ch->macs = param_.Y->numel() * 7.f;
}
#endif
private:
mutable LayerNormParam param_;
};
......
......@@ -38,6 +38,16 @@ class BinaryLogicalOp : public OpLite {
std::string DebugString() const override { return "binary logical"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = "X" + ch->DimToStr(param_.X->dims()) + "Y" +
ch->DimToStr(param_.Y->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
// ch->remark = "";
ch->macs = param_.Out->numel() * 3.f;
}
#endif
private:
mutable LogicalParam param_;
};
......@@ -57,6 +67,16 @@ class UnaryLogicalOp : public OpLite {
std::string DebugString() const override { return "unary logical"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = "X" + ch->DimToStr(param_.X->dims()) + "Y" +
ch->DimToStr(param_.Y->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
// ch->remark = "";
ch->macs = param_.Out->numel() * 3.f;
}
#endif
private:
mutable LogicalParam param_;
};
......
......@@ -33,8 +33,18 @@ class LrnOpLite : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "lrn"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
ch->remark = "n" + std::to_string(param_.n) + param_.norm_region;
ch->macs = param_.Out->numel() * param_.k * 2.f;
}
#endif
private:
mutable LrnParam param_;
};
......
......@@ -41,6 +41,31 @@ class MatMulOpLite : public OpLite {
std::string DebugString() const override { return "matmul"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->filter_shape = ch->DimToStr(param_.Y->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
ch->remark = "alpha" + std::to_string(param_.alpha) + "trans_x" +
std::to_string(param_.transpose_X) + "trans_y" +
std::to_string(param_.transpose_Y);
auto x_dims = param_.X->dims();
auto y_dims = param_.Y->dims();
auto m = x_dims[x_dims.size() - 2];
auto k = x_dims[x_dims.size() - 1];
auto n = y_dims[y_dims.size() - 1];
if (param_.transpose_X) {
m = x_dims[x_dims.size() - 1];
k = x_dims[x_dims.size() - 2];
}
if (param_.transpose_Y) {
n = y_dims[y_dims.size() - 2];
}
ch->macs = 3.f * m * n * k;
}
#endif
private:
mutable MatMulParam param_;
};
......
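The M/N/K selection above is easy to sanity-check in isolation. This sketch mirrors the diff's logic, including its 3.f cost factor, for illustrative shapes X = [8, 16], Y = [16, 32] with no transpose:

```cpp
// Sanity check of the matmul M/N/K selection under the transpose flags.
#include <cstdio>
#include <vector>

int main() {
  std::vector<long> x_dims{8, 16}, y_dims{16, 32};
  bool transpose_X = false, transpose_Y = false;
  long m = x_dims[x_dims.size() - 2];
  long k = x_dims[x_dims.size() - 1];
  long n = y_dims[y_dims.size() - 1];
  if (transpose_X) {  // X is used as X^T: swap its two trailing dims
    m = x_dims[x_dims.size() - 1];
    k = x_dims[x_dims.size() - 2];
  }
  if (transpose_Y) {  // Y is used as Y^T: N comes from the other dim
    n = y_dims[y_dims.size() - 2];
  }
  printf("macs = %.0f\n", 3.f * m * n * k);  // 3 * 8 * 32 * 16 = 12288
  return 0;
}
```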
......@@ -35,6 +35,15 @@ class MeanOp : public OpLite {
std::string DebugString() const override { return "mean"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
// ch->remark = "";
ch->macs = param_.X->numel() * 1.f;
}
#endif
private:
mutable operators::MeanParam param_;
};
......
......@@ -63,6 +63,20 @@ class MulOpLite : public OpLite {
std::string DebugString() const override { return "mul"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.x->dims());
ch->filter_shape = ch->DimToStr(param_.y->dims());
ch->output_shape = ch->DimToStr(param_.output->dims());
// ch->remark = "";
auto x_dims = param_.x->dims();
auto y_dims = param_.y->dims();
auto x_mat_dims = x_dims.Flatten2D(param_.x_num_col_dims);
auto y_mat_dims = y_dims.Flatten2D(param_.y_num_col_dims);
ch->macs = 1.f * x_mat_dims[0] * x_mat_dims[1] * y_mat_dims[1];
}
#endif
private:
mutable MulParam param_;
};
......
......@@ -35,8 +35,18 @@ class NegativeOpLite : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "negative"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
// ch->remark = "";
ch->macs = 1.f * param_.Out->numel();
}
#endif
private:
mutable NegativeParam param_;
};
......
......@@ -36,8 +36,18 @@ class PowerOp : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "power"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
// ch->remark = "";
ch->macs = param_.Out->numel() * 3.0f;
}
#endif
private:
mutable PowerParam param_;
};
......
......@@ -32,8 +32,29 @@ class ReduceMaxOp : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "reduce_max"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
ch->remark = "keep_dim" + std::to_string(param_.keep_dim);
auto dims = param_.dim;
auto in_sum = param_.X->numel();
if (dims.size() == 0 || dims.size() == 1) {
ch->macs = 1.f * in_sum;
} else if (dims.size() == 2) {
ch->macs = 2.f * in_sum;
} else {
LOG(FATAL) << "This dims size of ReduceMaxParm: " << dims.size()
<< " doesn't support";
ch->macs = 0.f;
}
}
#endif
private:
mutable ReduceMaxParam param_;
};
......
......@@ -26,14 +26,41 @@ namespace operators {
class ReduceMeanOp : public OpLite {
public:
ReduceMeanOp() {}
explicit ReduceMeanOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "reduce_mean"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
ch->remark = "keep_dim" + std::to_string(param_.keep_dim);
auto dims = param_.dim;
auto in_sum = param_.X->numel();
if (dims.size() == 0) {
ch->macs = 1.f * in_sum;
} else if (dims.size() == 1) {
ch->macs = 2.f * in_sum;
} else if (dims.size() == 2) {
ch->macs = 4.f * in_sum;
} else {
LOG(FATAL) << "This dims size of ReduceMean: " << dims.size()
<< " doesn't support";
ch->macs = 0.f;
}
}
#endif
private:
mutable ReduceMeanParam param_;
};
......
......@@ -37,6 +37,27 @@ class ReduceProdOpLite : public OpLite {
std::string DebugString() const override { return "reduce_prod"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.x->dims());
ch->output_shape = ch->DimToStr(param_.output->dims());
ch->remark = "keep_dim" + std::to_string(param_.keep_dim) + "reduce_all" +
std::to_string(param_.reduce_all);
auto dims = param_.dim;
auto in_sum = param_.x->numel();
if (dims.size() == 0 || dims.size() == 1) {
ch->macs = 1.f * in_sum;
} else if (dims.size() == 2) {
ch->macs = 2.f * in_sum;
} else {
LOG(FATAL) << "This dims size of ReduceProd: " << dims.size()
<< " doesn't support";
ch->macs = 0.f;
}
}
#endif
private:
mutable ReduceParam param_;
};
......
......@@ -18,6 +18,9 @@
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
#ifdef LITE_WITH_PROFILE
#include "lite/api/paddle_place.h"
#endif
namespace paddle {
namespace lite {
......@@ -35,8 +38,61 @@ class ReluOp : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "relu"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = ActivationTypeToStr(param_.active_type);
switch (param_.active_type) {
case lite_api::ActivationType::kRelu:
ch->macs = param_.X->numel();
break;
case lite_api::ActivationType::kRelu6:
ch->macs = param_.X->numel() * 2.0;
break;
case lite_api::ActivationType::kLeakyRelu:
ch->macs = param_.X->numel() * 2.0;
break;
case lite_api::ActivationType::kPRelu:
ch->macs = param_.X->numel() * 2.0;
break;
case lite_api::ActivationType::kSwish:
ch->macs = param_.X->numel() * 4.0;
break;
case lite_api::ActivationType::kSigmoid:
ch->macs = param_.X->numel() * 3.0;
break;
case lite_api::ActivationType::kTanh:
ch->macs = param_.X->numel() * 5.0;
break;
case lite_api::ActivationType::kExp:
ch->macs = param_.X->numel();
break;
case lite_api::ActivationType::kAbs:
ch->macs = param_.X->numel();
break;
case lite_api::ActivationType::kHardSwish:
ch->macs = param_.X->numel() * 5.0;
break;
case lite_api::ActivationType::kReciprocal:
ch->macs = param_.X->numel();
break;
case lite_api::ActivationType::kIndentity:
break;
default:
LOG(FATAL) << "This Type of Activation:"
<< static_cast<int>(param_.active_type)
<< ActivationTypeToStr(param_.active_type)
<< " doesn't support";
}
}
#endif
private:
mutable ActivationParam param_;
};
......
......@@ -35,8 +35,19 @@ class ScaleOp : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "scale"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.x->dims());
ch->output_shape = ch->DimToStr(param_.output->dims());
ch->remark =
param_.activation_type + "alpha" + std::to_string(param_.alpha);
ch->macs = param_.x->numel() * 1.f;
}
#endif
private:
mutable ScaleParam param_;
};
......
......@@ -27,17 +27,48 @@ class SearchAlignedMatMulOpLite : public OpLite {
public:
SearchAlignedMatMulOpLite() {}
explicit SearchAlignedMatMulOpLite(const std::string &type) : OpLite(type) {}
explicit SearchAlignedMatMulOpLite(const std::string& type) : OpLite(type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override;
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
std::string DebugString() const override { return "search_aligned_mat_mul"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter* ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->filter_shape = ch->DimToStr(param_.Y->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
ch->remark = "alpha" + std::to_string(param_.alpha) + "trans_x" +
std::to_string(param_.transpose_X) + "trans_y" +
std::to_string(param_.transpose_Y);
const auto x_dims = param_.X->dims();
const auto y_dims = param_.Y->dims();
const auto& x_lod = param_.X->lod();
const auto& y_lod = param_.Y->lod();
const auto& x_lod_0 = x_lod[0];
const auto& y_lod_0 = y_lod[0];
int x_inner_size = x_dims[1];
int y_inner_size = y_dims[1];
int x_batch_size = x_lod_0[1];
int y_batch_size = y_lod_0[1];
int M = param_.transpose_X ? x_inner_size : x_batch_size;
int N = param_.transpose_Y ? y_batch_size : y_inner_size;
int X_K = param_.transpose_X ? x_batch_size : x_inner_size;
int Y_K = param_.transpose_Y ? y_inner_size : y_batch_size;
CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal";
int K = X_K;
ch->macs = 2.0 * M * N * K;
}
#endif
private:
mutable MatMulParam param_;
};
......
......@@ -35,8 +35,21 @@ class SearchFcOpLite : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "search_fc"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.X->dims());
ch->filter_shape = ch->DimToStr(param_.W->dims());
ch->output_shape = ch->DimToStr(param_.Out->dims());
ch->remark = "out_size" + std::to_string(param_.out_size);
auto x_dims = param_.X->dims();
auto w_dims = param_.W->dims();
ch->macs = 2.f * x_dims[0] * x_dims[1] * w_dims[0];
}
#endif
private:
mutable SearchFcParam param_;
};
......
......@@ -36,8 +36,21 @@ class SearchSeqFcOpLite : public OpLite {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
std::string DebugString() const override { return "search_seq_fc"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
ch->input_shape = ch->DimToStr(param_.x->dims());
ch->filter_shape = ch->DimToStr(param_.w->dims());
ch->output_shape = ch->DimToStr(param_.out->dims());
ch->remark = "out_size" + std::to_string(param_.out_size);
auto x_dims = param_.x->dims();
auto w_dims = param_.w->dims();
ch->macs = 2.f * x_dims[0] * x_dims[1] * w_dims[0];
}
#endif
private:
mutable SearchSeqFcParam param_;
};
......
......@@ -36,8 +36,20 @@ class SearchSeqSoftmaxOp : public OpLite {
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "search_seq_softmax_op"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.x->dims();
auto output_dims = param_.output->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "axis" + std::to_string(param_.axis);
ch->macs = 4.f * param_.x->numel();
}
#endif
private:
mutable SoftmaxParam param_;
};
......
......@@ -38,10 +38,17 @@ static bool IsFileExists(const std::string& path) {
// ARM mobile does not support mkdir in C++
static void MkDirRecur(const std::string& path) {
#ifndef LITE_WITH_ARM
#ifdef _WIN32
if (system(string_format("md %s", path.c_str()).c_str()) != 0) {
LOG(ERROR) << "Cann't mkdir " << path;
}
#else
if (system(string_format("mkdir -p %s", path.c_str()).c_str()) != 0) {
LOG(ERROR) << "Cann't mkdir " << path;
}
#else // On ARM
#endif // _WIN32
#else // On ARM
CHECK_NE(mkdir(path.c_str(), S_IRWXU), -1) << "Can't mkdir " << path;
#endif
}
......
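The hunk above shells out to `md` on Windows and `mkdir -p` elsewhere. On toolchains where C++17 is available, `std::filesystem::create_directories` gives the same recursive behavior without `system()`. A hedged alternative sketch, not what the source uses (older Android NDK toolchains such as ndk-r17c lack `<filesystem>`):

```cpp
// Portable recursive mkdir via std::filesystem (C++17).
#include <filesystem>
#include <iostream>
#include <system_error>

int main() {
  std::error_code ec;
  // create_directories is recursive like "mkdir -p". It returns false
  // without setting ec when the path already exists; it returns false
  // and sets ec on a real failure.
  if (!std::filesystem::create_directories("a/b/c", ec) && ec) {
    std::cerr << "Can't mkdir a/b/c: " << ec.message() << "\n";
    return 1;
  }
  return 0;
}
```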