diff --git a/.gitignore b/.gitignore index ed131bdbbad6bd4dad500fa29f40a29fddeb7593..9823f8c945c1be8e717b622a993d402c49517b7c 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources + +build* diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake index 82dd60f4b4391c89e09533418874e0d6d8174d84..3db715ba74945d9e501637af5ef3086e4f11b294 100644 --- a/cmake/cross_compiling/postproject.cmake +++ b/cmake/cross_compiling/postproject.cmake @@ -57,22 +57,20 @@ function(check_linker_flag) endforeach() set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE) endfunction() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if (LITE_ON_TINY_PUBLISH) if((NOT LITE_WITH_PYTHON)) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") endif() + if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") + endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections") check_linker_flag(-Wl,--gc-sections) endif() -if(LITE_WITH_OPENCL) - if(ARM_TARGET_LANG STREQUAL "clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") - endif() -endif() - if(LITE_WITH_OPENMP) find_package(OpenMP REQUIRED) if(OPENMP_FOUND OR OpenMP_CXX_FOUND) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index d69232a0d95518217fba9eb3a7e15f7441695778..d16e7af3d7a61fff0ef13cf7cfcbd7af542e7c3f 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -285,6 +285,11 @@ set(host_kernels CACHE INTERNAL "host kernels") set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt") 
file(WRITE ${kernels_src_list} "") # clean + +# file to record faked kernels for opt python lib +set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt") +file(WRITE ${fake_kernels_src_list} "") # clean + if(LITE_BUILD_TAILOR) set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list") file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) @@ -313,56 +318,65 @@ function(add_kernel TARGET device level) return() endif() - if (LITE_ON_MODEL_OPTIMIZE_TOOL) - # the source list will collect for model_optimize_tool to fake kernel generation. - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() - endif() - - # when compiling the model_optimize_tool, a source file with all the fake kernel definitions will be generated, - # no need to continue the compilation of the true kernel source. - if (LITE_ON_MODEL_OPTIMIZE_TOOL) - return() - endif(LITE_ON_MODEL_OPTIMIZE_TOOL) - if ("${device}" STREQUAL "Host") set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") if (NOT LITE_WITH_ARM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "X86") if (NOT LITE_WITH_X86) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + elseif (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "NPU") if (NOT LITE_WITH_NPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() 
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "XPU") if (NOT LITE_WITH_XPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "FPGA") if (NOT LITE_WITH_FPGA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "BM") if (NOT LITE_WITH_BM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") @@ -375,6 +389,9 @@ function(add_kernel TARGET device level) endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "") @@ -382,6 +399,9 @@ function(add_kernel TARGET device level) if ("${device}" STREQUAL "CUDA") if (NOT LITE_WITH_CUDA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "") diff --git a/docs/benchmark/benchmark_tools.md b/docs/benchmark/benchmark_tools.md index 36bf8831f142b1bd6c988b0ece7192437643fcbf..3cf1486307ad79a47dfbfe199e3d6d708c99db4b 100644 --- a/docs/benchmark/benchmark_tools.md +++ b/docs/benchmark/benchmark_tools.md @@ -135,53 +135,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true > 不同手机,不同版本,测试模型的性能数据不同。 ```shell -run benchmark armv7 +run benchmark armv8 -------------------------------------- PaddleLite 
Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 159.8427 ms --- mobilenet_v1 avg = 235.0072 ms --- mobilenet_v2 avg = 173.0387 ms --- shufflenet_v2 avg = 76.0040 ms --- squeezenet_v11 avg = 164.2957 ms +mnasnet min = 19.83500 max = 19.38500 average = 19.65503 +mobilenetv1 min = 32.00600 max = 31.56900 average = 31.81983 +mobilenetv2 min = 22.37900 max = 22.08700 average = 22.28623 +shufflenetv2 min = 10.80400 max = 10.62900 average = 10.68890 +squeezenet min = 17.67400 max = 17.47900 average = 17.57677 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg = 83.1287 ms --- mobilenet_v1 avg = 121.6029 ms --- mobilenet_v2 avg = 86.6175 ms --- shufflenet_v2 avg = 41.5761 ms --- squeezenet_v11 avg = 87.8678 ms +mnasnet min = 11.85600 max = 11.72000 average = 11.77127 +mobilenetv1 min = 18.75000 max = 18.64300 average = 18.70593 +mobilenetv2 min = 14.05100 max = 13.59900 average = 13.71450 +shufflenetv2 min = 6.67200 max = 6.58300 average = 6.63400 +squeezenet min = 12.07100 max = 11.33400 average = 11.41253 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 73.3880 ms --- mobilenet_v1 avg = 119.0739 ms --- mobilenet_v2 avg = 85.3050 ms --- shufflenet_v2 avg = 38.0762 ms --- squeezenet_v11 avg = 64.2201 ms +mnasnet min = 7.19300 max = 7.02600 average = 7.08480 +mobilenetv1 min = 10.42000 max = 10.29100 average = 10.34267 +mobilenetv2 min = 8.61900 max = 8.46900 average = 8.54707 +shufflenetv2 min = 4.55200 max = 4.41900 average = 4.46477 +squeezenet min = 8.60000 max = 7.85200 average = 7.98407 -------------------------------------- -run benchmark armv8 +run benchmark armv7 -------------------------------------- PaddleLite Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 165.3073 ms --- mobilenet_v1 avg = 306.0188 ms --- mobilenet_v2 avg = 195.1884 ms --- shufflenet_v2 avg = 99.3692 ms --- squeezenet_v11 avg = 156.6971 ms +mnasnet min = 20.98300 max = 20.81400 average = 20.92527 +mobilenetv1 min = 33.19000 max = 32.81700 average = 33.08490 +mobilenetv2 min = 
25.91400 max = 25.61700 average = 25.73097 +shufflenetv2 min = 11.14300 max = 10.97600 average = 11.06757 +squeezenet min = 19.31800 max = 19.20000 average = 19.26530 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg = 90.2290 ms --- mobilenet_v1 avg = 157.0007 ms --- mobilenet_v2 avg = 118.1607 ms --- shufflenet_v2 avg = 68.6804 ms --- squeezenet_v11 avg = 91.3090 ms +mnasnet min = 12.59900 max = 12.46600 average = 12.52207 +mobilenetv1 min = 19.05800 max = 18.94700 average = 18.97897 +mobilenetv2 min = 15.28400 max = 15.11300 average = 15.19843 +shufflenetv2 min = 6.97000 max = 6.81400 average = 6.90863 +squeezenet min = 12.87900 max = 12.12900 average = 12.22530 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 179.9730 ms --- mobilenet_v1 avg = 204.0684 ms --- mobilenet_v2 avg = 181.6486 ms --- shufflenet_v2 avg = 123.2728 ms --- squeezenet_v11 avg = 412.9046 ms +mnasnet min = 7.31400 max = 7.12900 average = 7.20357 +mobilenetv1 min = 11.44000 max = 10.86900 average = 10.94383 +mobilenetv2 min = 9.14900 max = 9.03800 average = 9.09907 +shufflenetv2 min = 4.60600 max = 4.49400 average = 4.53360 +squeezenet min = 8.27000 max = 8.10600 average = 8.19000 -------------------------------------- ``` diff --git a/docs/demo_guides/npu.md b/docs/demo_guides/npu.md index 9722ff6aabda87cb02adc111dd1b29e9bdcf3f55..0bdec8d73a881c186d9c4141e2d59a1b2bf11d8b 100644 --- a/docs/demo_guides/npu.md +++ b/docs/demo_guides/npu.md @@ -103,7 +103,6 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an --optimize_out_type=(protobuf|naive_buffer) \ --optimize_out= \ --valid_targets=npu,arm \ - --prefer_int8_kernel=(true|false) \ --record_tailoring_info =(true|false) ``` - model_optimize_tool生成的模型只是标记了NPU支持的Paddle算子,并没有真正生成NPU HiAI模型,只有在执行时才会将标记的Paddle算子转成HiAI IR,最终生成并执行HiAI模型,具体实现参考PR[2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576)。 diff --git a/docs/demo_guides/opencl.md b/docs/demo_guides/opencl.md index 
325a772df31ce3873941f74e8a4ed1069e0b3da2..e255038575796f0c1079f47fb859f8402ac79c1f 100644 --- a/docs/demo_guides/opencl.md +++ b/docs/demo_guides/opencl.md @@ -65,9 +65,11 @@ rm ./lite/api/paddle_use_ops.h --arm_os=android \ --arm_abi=armv8 \ --arm_lang=gcc \ - build_test_arm_opencl + build_opencl ``` +注:如果要调试cl kernel,假设已经完成上述脚本编译(已生成cmake文件)。调试只需要修改`./lite/backends/opencl/cl_kernel/`下对应的kernel文件,保存后在项目根目录执行`python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc`,该命令会自动将修改后,再切到build目录下执行`make publish_inference`或者你要编译的单测的可执行文件名,cl kernel文件的内容会随着编译自动打包到产物包如 .so 中或者对应单测可执行文件中。 + ### 编译产物说明 编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内,这里仅罗列关键产物: diff --git a/docs/user_guides/library_tailoring.md b/docs/user_guides/library_tailoring.md index e32500bd5851ddad0de3784fb47a7b6326aff6f4..cf0641b7314f112e9cb7ac4f0a9094bdbdaa7ca6 100644 --- a/docs/user_guides/library_tailoring.md +++ b/docs/user_guides/library_tailoring.md @@ -39,7 +39,7 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 例如: ```bash -./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish +./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish ``` **注意**:上面命令中的`../mobilenet_v1NB`是第1步得到的转化模型的输出路径 @@ -88,9 +88,6 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 #include #include #include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -182,4 +179,4 @@ int main(int argc, char** argv) { 1. 模型集合**必须**均为combined参数模型或均为非combined参数模型。 2. 
使用非combined参数模型时,模型拓扑文件名应为`__model__`,使用非combined参数模型时,集合中各模型的拓扑与参数名应相同,分别由`--model_filename`和`--param_filename`指定。 3. 模型集合**必须**均为INT8量化模型或均为非INT8量化模型。 -4. 需要使用Paddle-Lite 最新版本(release/v2.1.0之后)代码编译出的model_optimize_tool。 +4. 需要使用Paddle-Lite `release/v2.1.0`之后版本代码编译出的模型优化工具。 diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md index 47f663dc75cdcf0950c87bfe45a78e65604ccbaf..c3d5f527048519e851cc8b9e785dc39668e971a4 100644 --- a/docs/user_guides/model_optimize_tool.md +++ b/docs/user_guides/model_optimize_tool.md @@ -83,7 +83,6 @@ PaddlePaddle模型有两种保存格式: --optimize_out_type=(protobuf|naive_buffer) \ --optimize_out= \ --valid_targets=(arm|opencl|x86|npu|xpu) \ - --prefer_int8_kernel=(true|false) \ --record_tailoring_info =(true|false) ``` @@ -95,12 +94,12 @@ PaddlePaddle模型有两种保存格式: | --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf。 | | --optimize_out | 优化模型的输出路径。 | | --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm。 | -| --prefer_int8_kernel | 若待优化模型为int8量化模型(如量化训练得到的量化模型),则设置该选项为true以使用int8内核函数进行推理加速,默认为false。 | | --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false。 | * 如果待优化的fluid模型是非combined形式,请设置`--model_dir`,忽略`--model_file`和`--param_file`。 * 如果待优化的fluid模型是combined形式,请设置`--model_file`和`--param_file`,忽略`--model_dir`。 * 优化后的模型为以`.nb`名称结尾的单个文件。 +* 删除`prefer_int8_kernel`的输入参数,`opt`自动判别是否是量化模型,进行相应的优化操作。 ### 功能二:统计模型算子信息、判断是否支持 diff --git a/docs/user_guides/model_quantization.md b/docs/user_guides/model_quantization.md index d90fa4bae34cccbcf809bdb2cd102eaf8c468b01..cf506cfa61e3942452ddaf1218d9d55c2fffa3fc 100644 --- a/docs/user_guides/model_quantization.md +++ b/docs/user_guides/model_quantization.md @@ -245,7 +245,6 @@ python compress.py \ 
--optimize_out_type=naive_buffer \ --optimize_out=mobilenet_v1_quant_opt \ --valid_targets=arm \ ---prefer_int8_kernel=true ``` 如前所述,量化训练后,float目录下的模型参数范围为int8,但参数数据类型仍为float32类型,这样确实没有起到模型参数压缩的效果。但是,经过model\_optimize\_tool工具优化后对应的量化参数均会以int8类型重新存储达到参数压缩的效果,且模型结构也被优化(如进行了各种operator fuse操作)。 diff --git a/docs/user_guides/post_quant_no_data.md b/docs/user_guides/post_quant_no_data.md index 206045822b896e07fca2651768b32c89c7615cb2..4068249ff7544f42c5f2643c971eb003836b1f59 100644 --- a/docs/user_guides/post_quant_no_data.md +++ b/docs/user_guides/post_quant_no_data.md @@ -86,7 +86,6 @@ WeightQuantization.quantize_weight_to_int(save_model_dir, 参考[模型转换](../user_guides/model_optimize_tool)准备模型转换工具,建议从Release页面下载。 参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具。 -因为该模型会将量化的权重反量化,然后实际加载并执行FP32预测模型,所以opt命令的输入参数--prefer_int8_kernel不需要设置为true,同时其他参数按照实际情况参考文档设置。 比如在安卓手机ARM端进行预测,模型转换的命令为: ```bash ./opt --model_dir=./mobilenet_v1_quant \ diff --git a/docs/user_guides/post_quant_with_data.md b/docs/user_guides/post_quant_with_data.md index 8b293cc7e47a33037de3706a30fd583c5516d165..0044b47610a2a211859bdc42f83f1921a681d50b 100644 --- a/docs/user_guides/post_quant_with_data.md +++ b/docs/user_guides/post_quant_with_data.md @@ -147,13 +147,12 @@ with fluid.name_scope('skip_quant'): 参考[模型转换](../user_guides/model_optimize_tool)准备模型转换工具,建议从Release页面下载。 -参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具。注意opt命令的输入参数--prefer_int8_kernel必须设置为true,其他参数按照实际情况参考文档设置。比如在安卓手机ARM端进行预测,模型转换的命令为: +参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具,参数按照实际情况设置。比如在安卓手机ARM端进行预测,模型转换的命令为: ```bash ./opt --model_dir=./mobilenet_v1_quant \ --optimize_out_type=naive_buffer \ --optimize_out=mobilenet_v1_quant_opt \ - --valid_targets=arm \ - --prefer_int8_kernel=true + --valid_targets=arm ``` ### 3.2 量化模型预测 diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md index 6bb71938cab16a92e1c33e3d8276872fbcea580a..8f8aeb6af124bc4805c281e22e39cca51b507651 100644 --- 
a/docs/user_guides/tutorial.md +++ b/docs/user_guides/tutorial.md @@ -24,8 +24,7 @@ $ ./opt \ --param_file= \ --optimize_out_type=(protobuf|naive_buffer) \ --optimize_out= \ - --valid_targets=(arm|opencl|x86) \ - --prefer_int8_kernel=(ture|false) + --valid_targets=(arm|opencl|x86) ``` 其中,optimize_out为您希望的优化模型的输出路径。optimize_out_type则可以指定输出模型的序列化方式,其目前支持Protobuf与Naive Buffer两种方式,其中Naive Buffer是一种更轻量级的序列化/反序列化实现。如果你需要使用Lite在mobile端进行预测,那么您需要设置optimize_out_type=naive_buffer。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 361a518c280150d167ace0c737a2822665b73ff9..12dd17c5a302259fb8f903735115106526716194 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -84,7 +84,16 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") if (LITE_WITH_PYTHON) add_custom_target(publish_inference_python_lib ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite_core.so") + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND python setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) add_custom_target(publish_inference_python_light_demo ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python" COMMAND cp 
"${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/") @@ -96,6 +105,7 @@ if (LITE_WITH_PYTHON) endif() add_dependencies(publish_inference_python_lib lite_pybind) add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_light_demo) endif() @@ -213,6 +223,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference tiny_publish_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) endif() endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 8425d7923447d4245c0895a3f2e9409bfeaecd79..2a93331f4ac179cc35acb65bd9271c68a93d71ad 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -308,6 +308,11 @@ if (LITE_ON_TINY_PUBLISH) return() endif() + +# add library for opt_base +lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) +add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) + if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index b739c78f7c883d62b39d88ae1a7f4bf76ae8932c..556a9e0af01854ff5c57a14dade72b81ed255964 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -294,6 +294,32 @@ void Predictor::Build(const cpp::ProgramDesc &desc, inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + + 
const std::vector quant_dequant_op = { + "fake_quantize_abs_max", + "fake_quantize_range_abs_max", + "fake_quantize_moving_average_abs_max", + "fake_quantize_dequantize_moving_average_abs_max", + "fake_dequantize_max_abs", + "fake_channel_wise_dequantize_max_abs"}; + bool is_quantized_model = false; + for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model; + ++i) { + auto *block_desc = program_desc_.GetBlock(i); + for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) { + auto *op_desc = block_desc->GetOp(j); + std::string op_type = op_desc->Type(); + if (std::find(quant_dequant_op.begin(), + quant_dequant_op.end(), + op_type) != quant_dequant_op.end()) { + is_quantized_model = true; + } + } + } + if (is_quantized_model) { + inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)}); + } + Program program(desc, scope_, inner_places); core::KernelPickFactor factor; diff --git a/lite/api/cxx_api_bin.cc b/lite/api/cxx_api_bin.cc index 8c929e9c8700a65c868e2facd763b0ec36719e23..eec17cc30e308e7169b7d8c394c0e47eee0c1c3e 100644 --- a/lite/api/cxx_api_bin.cc +++ b/lite/api/cxx_api_bin.cc @@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) { int main(int argc, char** argv) { CHECK_EQ(argc, 3) << "usage: ./cmd "; - paddle::lite::Run(argv[1], std::stoi(argv[2])); + paddle::lite::Run(argv[1], atoi(argv[2])); return 0; } diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index 3965843250abe45c43490bdbb4aaed58915e0908..cdf5b7fb06df35b2e7fb72fc4e33ccb721a0f7f7 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -58,6 +58,7 @@ void LightPredictorImpl::Run() { std::shared_ptr LightPredictorImpl::Clone() { LOG(FATAL) << "The Clone API is not supported in LigthPredictor"; + return nullptr; } std::string LightPredictorImpl::GetVersion() const { return lite::version(); } diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index 
9164129dcf4566fc02803c1c7dcffd9e97a830d6..5342a36ec154b2bdde44fa72bc21e9d430ad4efe 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -95,7 +95,7 @@ void TestModel(const std::vector& valid_places, if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; + double eps = first_target == TARGET(kOpenCL) ? 0.12 : 0.1; for (int i = 0; i < ref.size(); ++i) { for (int j = 0; j < ref[i].size(); ++j) { auto result = pdata[j * step + (out->dims()[1] * i)]; @@ -119,21 +119,21 @@ void TestModel(const std::vector& valid_places, // Get detailed result size_t output_tensor_num = predictor.GetOutputNames().size(); - VLOG(1) << "output tesnor num:" << output_tensor_num; + VLOG(1) << "output tensor num:" << output_tensor_num; for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { auto* output_tensor = predictor.GetOutput(tidx); VLOG(1) << "============= output tensor " << tidx << " =============\n"; auto out_dims = output_tensor->dims(); - VLOG(1) << "out_dims:" << out_dims; - - float sum = 0.f; - for (int i = 0; i < out_dims.production(); ++i) { - sum += output_tensor->data()[i]; - } - VLOG(1) << "out_dims.production():" << out_dims.production(); - VLOG(1) << "output tensor sum value:" << sum; - VLOG(1) << "output tensor mean value:" << sum / out_dims.production(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << "output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; // print result for (int i = 0; i < out_dims.production(); ++i) { diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 
26b9dc93da73e8f637c01fca8f7ea99a8e5e9af0..465f82056c6bb80b706cfb7d875773d75735911b 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -97,7 +97,7 @@ void TestModel(const std::vector& valid_places, if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; + double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1; for (int i = 0; i < ref.size(); ++i) { for (int j = 0; j < ref[i].size(); ++j) { auto result = pdata[j * step + (out->dims()[1] * i)]; @@ -121,21 +121,21 @@ void TestModel(const std::vector& valid_places, // Get detailed result size_t output_tensor_num = predictor.GetOutputNames().size(); - VLOG(1) << "output tesnor num:" << output_tensor_num; + VLOG(1) << "output tensor num:" << output_tensor_num; for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { auto* output_tensor = predictor.GetOutput(tidx); VLOG(1) << "============= output tensor " << tidx << " =============\n"; auto out_dims = output_tensor->dims(); - VLOG(1) << "out_dims:" << out_dims; - - float sum = 0.f; - for (int i = 0; i < out_dims.production(); ++i) { - sum += output_tensor->data()[i]; - } - VLOG(1) << "out_dims.production():" << out_dims.production(); - VLOG(1) << "output tensor sum value:" << sum; - VLOG(1) << "output tensor mean value:" << sum / out_dims.production(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << "output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; // print result for (int i = 0; i < out_dims.production(); ++i) { diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 
ed4ab75366a0ab669fb8fe6e1d15ad9fd2f5aef5..b0f7a0479f0db91b816838f9d0ee1cc31b9b232a 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -138,7 +138,7 @@ void Run(const std::vector>& input_shapes, std::ofstream out(FLAGS_arg_name + ".txt"); for (size_t i = 0; i < arg_num; ++i) { sum += arg_tensor->data()[i]; - out << std::to_string(arg_tensor->data()[i]) << "\n"; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; } LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() << ", mean value is " << sum * 1. / arg_num; diff --git a/lite/api/model_test_classify.cc b/lite/api/model_test_classify.cc index bea0ab15e49dc55e0a8f5f29d455b5504345cf19..375d249476bf5323d69ea41c3f11d07e9c8bc711 100644 --- a/lite/api/model_test_classify.cc +++ b/lite/api/model_test_classify.cc @@ -250,7 +250,7 @@ void Run(const std::vector>& input_shapes, std::ofstream out(FLAGS_arg_name + ".txt"); for (size_t i = 0; i < arg_num; ++i) { sum += arg_tensor->data()[i]; - out << std::to_string(arg_tensor->data()[i]) << "\n"; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; } LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() << ", mean value is " << sum * 1. / arg_num; diff --git a/lite/api/model_test_detection.cc b/lite/api/model_test_detection.cc index 36a23999d33b38d8c54f604850bf5d4120ce3d72..f9be12b2c78c623a2b2c9852850576cc11815bd3 100644 --- a/lite/api/model_test_detection.cc +++ b/lite/api/model_test_detection.cc @@ -264,7 +264,7 @@ void Run(const std::vector>& input_shapes, std::ofstream out(FLAGS_arg_name + ".txt"); for (size_t i = 0; i < arg_num; ++i) { sum += arg_tensor->data()[i]; - out << std::to_string(arg_tensor->data()[i]) << "\n"; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; } LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() << ", mean value is " << sum * 1. 
/ arg_num; diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 0b995fa8abde5850acefed8dee384b9206258f6a..51f9b565196d30520f0cf73ea41a01fed0cc49e8 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -67,7 +67,6 @@ DEFINE_string(valid_targets, "arm", "The targets this model optimized for, should be one of (arm, " "opencl, x86), splitted by space"); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); DEFINE_bool(print_supported_ops, false, "Print supported operators on the inputed target"); @@ -123,11 +122,6 @@ std::vector ParserValidPlaces() { << "At least one target should be set, should set the " "command argument 'valid_targets'"; - if (FLAGS_prefer_int8_kernel) { - LOG(WARNING) << "Int8 mode is only support by ARM target"; - valid_places.insert(valid_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } return valid_places; } @@ -257,7 +251,6 @@ void PrintHelpInfo() { " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" - " `--prefer_int8_kernel=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd86f486248a2daccde13da078ae3860d8e31169 --- /dev/null +++ b/lite/api/opt_base.cc @@ -0,0 +1,364 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/api/opt_base.h" +#include "all_kernel_faked.cc" // NOLINT + +namespace paddle { +namespace lite_api { + +void OptBase::SetModelDir(const std::string& model_path) { + opt_config_.set_model_dir(model_path); +} + +void OptBase::SetModelFile(const std::string& model_path) { + opt_config_.set_model_file(model_path); +} + +void OptBase::SetParamFile(const std::string& param_path) { + opt_config_.set_param_file(param_path); +} + +void OptBase::SetModelType(std::string optimize_out_type) { + if (optimize_out_type == "protobuf") { + model_type_ = LiteModelType::kProtobuf; + } else if (optimize_out_type == "naive_buffer") { + model_type_ = LiteModelType::kNaiveBuffer; + } else { + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; + } +} + +void OptBase::SetValidPlaces(const std::string& valid_places) { + valid_places_.clear(); + auto target_reprs = lite::Split(valid_places, ","); + for (auto& target_repr : target_reprs) { + if (target_repr == "arm") { + valid_places_.emplace_back(TARGET(kARM)); + } else if (target_repr == "opencl") { + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel + } else if (target_repr == "x86") { + valid_places_.emplace_back(TARGET(kX86)); + } else if (target_repr == "npu") { + valid_places_.emplace_back(TARGET(kNPU)); + } else if (target_repr == "xpu") { + valid_places_.emplace_back(TARGET(kXPU)); + } else { + LOG(FATAL) << lite::string_format( + "Wrong target '%s' found, please check the 
command flag " + "'valid_targets'", + target_repr.c_str()); + } + } + CHECK(!valid_places_.empty()) + << "At least one target should be set, should set the " + "command argument 'valid_targets'"; +} + +void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { + optimize_out_path_ = optimized_out_path; +} + +void OptBase::RunOptimize(bool record_strip_info) { + CheckIfModelSupported(false); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + opt_config_.set_valid_places(valid_places_); + if (model_set_dir_ != "") { + RunOptimizeFromModelSet(record_strip_info); + } else { + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + optimize_out_path_, model_type_, record_strip_info); + auto resulted_model_name = + record_strip_info ? "information of striped model" : "optimized model"; + std::cout << "Save the " << resulted_model_name + << " into :" << optimize_out_path_ << "successfully"; + } +} + +// collect ops info of modelset +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} + +void OptBase::SetModelSetDir(const std::string& model_set_path) { + model_set_dir_ = model_set_path; +} +void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { + // 1. mkdir of outputed optimized model set. + lite::MkDirRecur(optimize_out_path_); + auto model_dirs = lite::ListDir(model_set_dir_, true); + if (model_dirs.size() == 0) { + LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; + } + + // 2. optimize each model in inputed model set dir. 
+ std::string model_file = opt_config_.model_file(); + std::string param_file = opt_config_.param_file(); + for (const auto& name : model_dirs) { + std::string input_model_dir = + lite::Join({model_set_dir_, name}, "/"); + std::string output_model_dir = + lite::Join({optimize_out_path_, name}, "/"); + + if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { + auto model_file_path = + lite::Join({input_model_dir, model_file}, "/"); + auto param_file_path = + lite::Join({input_model_dir, param_file}, "/"); + } + + std::cout << "Start optimize model: " << input_model_dir; + + opt_config_.set_model_dir(input_model_dir); + opt_config_.set_model_file(model_file); + opt_config_.set_param_file(param_file); + + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + optimize_out_path_, model_type_, record_strip_info); + + std::cout << "Optimize done. "; + } + + // 3. if record_strip_info = true, we will record striping info + if (record_strip_info) { + // Collect all models information + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); + CollectModelMetaInfo(optimize_out_path_, + model_dirs, + lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); + std::cout << "Record the information of stripped models into :" + << optimize_out_path_ << "successfully"; + } +} + +void OptBase::PrintHelpInfo() { + const std::string opt_version = lite::version(); + const char help_info[] = + "At least one argument should be inputed. 
Valid arguments are listed " + "below:\n" + " Arguments of help information:\n" + " `help()` Print help infomation\n" + " Arguments of model optimization:\n" + " `set_model_dir(model_dir)`\n" + " `set_model_file(model_file_path)`\n" + " `set_param_file(param_file_path)`\n" + " `set_model_type(protobuf|naive_buffer)`\n" + " `set_optimize_out(output_optimize_model_dir)`\n" + " `set_valid_places(arm|opencl|x86|npu|xpu)`\n" + " `run_optimize(false|true)`\n" + " ` ----fasle&true refer to whether to record ops info for " + "tailoring lib, false by default`\n" + " Arguments of model checking and ops information:\n" + " `print_all_ops()` Display all the valid operators of " + "Paddle-Lite\n" + " `print_supported_ops` Display supported operators of valid " + "places\n" + " `check_if_model_supported()` Check if the input model is " + "supported\n"; + + std::cout << "opt version:" << opt_version << std::endl + << help_info << std::endl; +} +// 2. Print supported info of inputed ops +void OptBase::PrintOpsInfo(const std::set& valid_ops) { + std::vector lite_supported_targets = {"kHost", + "kX86", + "kCUDA", + "kARM", + "kOpenCL", + "kFPGA", + "kNPU", + "kXPU", + "kAny", + "kUnk"}; + // Get the lengh of the first column: maximum length of the op_type + size_t maximum_optype_length = 0; + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + maximum_optype_length = it->first.size() > maximum_optype_length + ? it->first.size() + : maximum_optype_length; + } + std::cout << std::setiosflags(std::ios::internal); + // Print the first row: OP_nam taget1 target2 ... 
+ std::cout << std::setw(maximum_optype_length) << "OP_name"; + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + std::cout << std::setw(10) << lite_supported_targets[i].substr(1); + } + std::cout << std::endl; + // Print the name of supported ops and mark if it's supported by each target + // print the support info of inputed ops: valid_ops + for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) { + std::cout << std::setw(maximum_optype_length) << *op; + // Check: If this kernel doesn't match any operator, we will skip it. + if (supported_ops.find(*op) == supported_ops.end()) { + continue; + } + // Print OP info. + auto ops_valid_places = supported_ops.at(*op); + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + lite_supported_targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } +} + +void OptBase::DisplayKernelsInfo() { // Display kernel information + std::cout << ::paddle::lite::KernelRegistry::Global().DebugString(); +} +void OptBase::PrintAllOps() { + // 1. Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < supported_ops_target.size(); i++) { + auto ops = supported_ops_target[i]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 2. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +void OptBase::PrintSupportedOps() { + // 1. 
Get the valid hardware targets + std::vector target_types = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + target_types.push_back(valid_places_[i].target); + } + std::string targets_str = TargetToStr(target_types[0]); + for (size_t i = 1; i < target_types.size(); i++) { + targets_str = targets_str + TargetToStr(target_types[i]); + } + std::cout << "Supported OPs on '" << targets_str << "': " << std::endl; + target_types.push_back(TARGET(kHost)); + target_types.push_back(TARGET(kUnk)); + + // 2. Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < target_types.size(); i++) { + auto ops = supported_ops_target[static_cast(target_types[i])]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 3. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +// test whether this model is supported +void OptBase::CheckIfModelSupported(bool print_ops_info) { + // 1. parse valid places and valid targets + auto valid_ops = supported_ops_target[static_cast(TARGET(kHost))]; + auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; + valid_ops.insert( + valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); + for (size_t i = 0; i < valid_places_.size(); i++) { + auto target = valid_places_[i].target; + auto ops = supported_ops_target[static_cast(target)]; + valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); + } + // get valid ops + std::set valid_ops_set(valid_ops.begin(), valid_ops.end()); + + // 2.Load model into program to get ops in model + std::string prog_path = opt_config_.model_dir() + "/__model__"; + if (!(opt_config_.model_file()).empty() && + !(opt_config_.param_file()).empty()) { + prog_path = opt_config_.model_file(); + } + lite::cpp::ProgramDesc cpp_prog; + framework::proto::ProgramDesc pb_proto_prog = + *lite::LoadProgram(prog_path, false); + lite::pb::ProgramDesc pb_prog(&pb_proto_prog); + // Transform to cpp::ProgramDesc + lite::TransformProgramDescAnyToCpp(pb_prog, 
&cpp_prog); + + std::set unsupported_ops; + std::set input_model_ops; + for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) { + auto current_block = cpp_prog.GetBlock(index); + for (size_t i = 0; i < current_block->OpsSize(); ++i) { + auto& op_desc = *current_block->GetOp(i); + auto op_type = op_desc.Type(); + input_model_ops.insert(op_type); + if (valid_ops_set.count(op_type) == 0) { + unsupported_ops.insert(op_type); + } + } + } + // 3. Print ops_info of input model and check if this model is supported + if (print_ops_info) { + std::cout << "OPs in the input model include:\n"; + PrintOpsInfo(input_model_ops); + } + if (!unsupported_ops.empty()) { + std::string unsupported_ops_str = *unsupported_ops.begin(); + for (auto op_str = ++unsupported_ops.begin(); + op_str != unsupported_ops.end(); + op_str++) { + unsupported_ops_str = unsupported_ops_str + ", " + *op_str; + } + std::vector targets = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + targets.push_back(valid_places_[i].target); + } + std::sort(targets.begin(), targets.end()); + targets.erase(unique(targets.begin(), targets.end()), targets.end()); + std::string targets_str = TargetToStr(targets[0]); + for (size_t i = 1; i < targets.size(); i++) { + targets_str = targets_str + "," + TargetToStr(targets[i]); + } + + LOG(ERROR) << "Error: This model is not supported, because " + << unsupported_ops.size() << " ops are not supported on '" + << targets_str << "'. These unsupported ops are: '" + << unsupported_ops_str << "'."; + exit(1); + } + if (print_ops_info) { + std::cout << "Paddle-Lite supports this model!" << std::endl; + exit(1); + } +} +} // namespace lite_api +} // namespace paddle diff --git a/lite/api/opt_base.h b/lite/api/opt_base.h new file mode 100644 index 0000000000000000000000000000000000000000..a8d6d0390ccd3f1c9b0291b1bcf6eb1ecc47a248 --- /dev/null +++ b/lite/api/opt_base.h @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines Opt and basic functions about model transformation. + */ + +#ifndef PADDLE_LITE_OPT_H_ // NOLINT +#define PADDLE_LITE_OPT_H_ +#include +#include +#include +#include +#include +// stores the map that records the source_file path of each kernel. +#include "kernel_src_map.h" // NOLINT +#include "lite/api/cxx_api.h" +// version of Paddle-lite +#include "lite/core/version.h" +// model parser functions to pre-load model to verify if this model is supported +#include "lite/model_parser/compatible_pb.h" +#include "lite/model_parser/pb/program_desc.h" +#include "lite/utils/string.h" +// recorded all the ops supported by paddle-lite +#include "supported_kernel_op_info.h" // NOLINT + +namespace paddle { +namespace lite_api { + +/// The PaddlePredictor defines the basic interfaces for different kinds of +/// predictors. 
+class LITE_API OptBase { + public: + OptBase() = default; + void SetModelSetDir(const std::string &model_set_path); + void SetModelDir(const std::string &model_path); + void SetModelFile(const std::string &model_path); + void SetParamFile(const std::string ¶m_path); + void SetValidPlaces(const std::string &valid_places); + void SetOptimizeOut(const std::string &optimized_out_path); + // set optimized_model type + void SetModelType(std::string model_type); + // transform and save the optimized model + void RunOptimize(bool record_strip_info = false); + + // fuctions of printing info + // 1. help info + void PrintHelpInfo(); + // 2. PrintOpsInfo + void PrintOpsInfo(const std::set &valid_ops = + {}); // print supported ops on target_types + void PrintAllOps(); // print all ops + void PrintSupportedOps(); // print ops supported on valid_places_ + void DisplayKernelsInfo(); // Display kernel information + // 3. Check if this model is supported + void CheckIfModelSupported(bool print_ops_info = true); + + private: + CxxConfig opt_config_; + // valid places for the optimized_model + std::vector valid_places_; + // filename of the optimized_model + std::string optimize_out_path_; + // type of the optimized_model, kNaiveBuffer default. 
+ LiteModelType model_type_{LiteModelType::kNaiveBuffer}; + // Dir path of a set of models, this should be combined with model + std::string model_set_dir_; + + void RunOptimizeFromModelSet(bool record_strip_info = false); +}; + +} // namespace lite_api +} // namespace paddle + +#endif // NOLINT diff --git a/lite/api/python/CMakeLists.txt b/lite/api/python/CMakeLists.txt index 43178a37c663bb09acb7c025e021cbc91bf0cc5d..ba0c6eb2404ce1ffc2ad5950ee5a3476d42f01b8 100644 --- a/lite/api/python/CMakeLists.txt +++ b/lite/api/python/CMakeLists.txt @@ -2,6 +2,23 @@ if (NOT LITE_WITH_PYTHON) return() endif() +# to create setup.py for packeting whl for Paddle-Lite and opt + +execute_process( + COMMAND git describe --tags --exact-match + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_TAG + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) add_subdirectory(pybind) #add_subdirectory(interface) diff --git a/lite/api/python/__init__.py b/lite/api/python/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/lite/api/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt index eabb6b150b93a722282118c3932676cd1aee5da8..b1de18d50c1582b0f872ad38d24939665ab1d3b0 100644 --- a/lite/api/python/pybind/CMakeLists.txt +++ b/lite/api/python/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ set(PYBIND_DEPS pybind python paddle_api_light paddle_api) if (NOT LITE_ON_TINY_PUBLISH) - set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full) + set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) endif() lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index 40b6db6d42c3a065ec09d535f0e9da22e8fa0399..942d7f8b540a6ff7ae6d62e98e6e573e1af12aa8 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -26,11 +26,12 @@ #ifndef LITE_ON_TINY_PUBLISH #include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" +#include "lite/api/opt_base.h" #endif #include "lite/api/light_api.h" #include "lite/api/paddle_api.h" +#include "lite/core/tensor.h" namespace py = pybind11; @@ -48,10 +49,27 @@ using lite_api::DataLayoutType; using lite_api::Place; using lite_api::MLUCoreVersion; using lite::LightPredictorImpl; +using lite_api::OptBase; #ifndef LITE_ON_TINY_PUBLISH using lite::CxxPaddleApiImpl; static void BindLiteCxxPredictor(py::module *m); +void BindLiteOpt(py::module *m) { + py::class_ opt_base(*m, "Opt"); + opt_base.def(py::init<>()) + .def("set_model_dir", &OptBase::SetModelDir) + .def("set_modelset_dir", &OptBase::SetModelSetDir) + .def("set_model_file", &OptBase::SetModelFile) + .def("set_param_file", &OptBase::SetParamFile) + .def("set_valid_places", &OptBase::SetValidPlaces) + .def("set_optimize_out", &OptBase::SetOptimizeOut) + .def("set_model_type", &OptBase::SetModelType) + .def("run_optimize", &OptBase::RunOptimize) + .def("help", 
&OptBase::PrintHelpInfo) + .def("print_supported_ops", &OptBase::PrintSupportedOps) + .def("display_kernels_info", &OptBase::DisplayKernelsInfo) + .def("print_all_ops", &OptBase::PrintAllOps); +} #endif static void BindLiteLightPredictor(py::module *m); static void BindLiteCxxConfig(py::module *m); diff --git a/lite/api/python/pybind/pybind.h b/lite/api/python/pybind/pybind.h index ca05f24b32fd0b0418d9cf595fe6134b34fa725f..15609957e05391be54466262f962e151594ef383 100644 --- a/lite/api/python/pybind/pybind.h +++ b/lite/api/python/pybind/pybind.h @@ -22,11 +22,15 @@ namespace lite { namespace pybind { void BindLiteApi(pybind11::module *m); +void BindLiteOpt(pybind11::module *m); -PYBIND11_MODULE(lite_core, m) { +PYBIND11_MODULE(lite, m) { m.doc() = "C++ core of Paddle-Lite"; BindLiteApi(&m); +#ifndef LITE_ON_TINY_PUBLISH + BindLiteOpt(&m); +#endif } } // namespace pybind diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in new file mode 100644 index 0000000000000000000000000000000000000000..79028fb7493bf55eab74aa76ee51ac79f418ba0a --- /dev/null +++ b/lite/api/python/setup.py.in @@ -0,0 +1,72 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so']} +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] + +# link lite.so to paddlelite.libs +COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ +/inference_lite_lib/python/install/lite/lite.so" +if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. + # So that package points to other directory. 
+ 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h index 71752c942bb53e7f2ed289ac0d965ae1d1007c55..a17fc331310cfe17ec36be504b94ddacc724e90f 100644 --- a/lite/api/test_helper.h +++ b/lite/api/test_helper.h @@ -17,6 +17,7 @@ #include #include #include +#include // for eval DEFINE_string(model_dir, "", "model dir"); @@ -43,5 +44,31 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 186ad19735799dcb91641354af4b4f09692bfce9..47a4d427f5400212a80fc31336e462a1c48bd640 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast(const float* dinx, } } +template <> +void elementwise_add_grad(const float* dout_grad, + float* x_grad, + int num) { + int cnt = num >> 4; + int remain = num & 0x0f; +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* x_data = x_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data 
+ 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + vst1q_f32(x_data, din0); + vst1q_f32(x_data + 4, din1); + vst1q_f32(x_data + 8, din2); + vst1q_f32(x_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* x_data = x_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + x_data[i] = out_data[i]; + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_add_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if (y_grad) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post >> 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum += out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum += vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_sub(const float* dinx, const float* diny, @@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast(const float* dinx, } } } +// we assume the formula is x-y +template <> +void elementwise_sub_grad(const float* dout_grad, + float* x_grad, + float* y_grad, + int num) { + if (x_grad) { + elementwise_add_grad(dout_grad, x_grad, num); + } + if (y_grad) { + int cnt = num >> 4; + int remain = num & 0x0f; + float32x4_t minus = vdupq_n_f32(-1); +#pragma omp parallel for + for 
(int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* y_data = y_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data + 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + din0 = vmulq_f32(din0, minus); + din1 = vmulq_f32(din1, minus); + din2 = vmulq_f32(din2, minus); + din3 = vmulq_f32(din3, minus); + vst1q_f32(y_data, din0); + vst1q_f32(y_data + 4, din1); + vst1q_f32(y_data + 8, din2); + vst1q_f32(y_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* y_data = y_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + y_data[i] = -out_data[i]; + } + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_sub_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if (y_grad) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post << 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum -= out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum -= vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_mul(const float* dinx, diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h index 
60d702742dec58f1502837617f5d4059dbb43e22..06ecab08edcaf06614de94b99084be2ee80647aa 100644 --- a/lite/backends/arm/math/elementwise.h +++ b/lite/backends/arm/math/elementwise.h @@ -183,6 +183,13 @@ template void elementwise_add_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_add_grad(const T* dout, T* dinx, int num); + +template +void elementwise_add_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_sub(const T* dinx, const T* diny, T* dout, int num); @@ -197,6 +204,13 @@ template void elementwise_sub_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_sub_grad(const T* dout, T* dinx, T* diny, int num); + +template +void elementwise_sub_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_mul(const T* dinx, const T* diny, T* dout, int num); diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index d17ce0dea4640899482ba9dd87d0646ca2de705d..a7d4322326c9413878264400ba8118b510fade10 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -983,10 +983,12 @@ void sgemv_trans(const int M, "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ - "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ - "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \ "vmla.f32 q0, q4, q6 @ mul add\n" \ + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ "vmla.f32 q1, q4, q8 @ mul add\n" \ + "vld1.32 {d24-d27}, [%[w3]]! 
@ load weights r3, q12,q13\n" \ + /*"vmla.f32 q0, q4, q6 @ mul add\n" */ \ + /*"vmla.f32 q1, q4, q8 @ mul add\n" */ \ "vmla.f32 q2, q4, q10 @ mul add\n" \ "vmla.f32 q3, q4, q12 @ mul add\n" \ "subs %[cnt], #1 @ sub loop count \n" \ diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index 9b1189c407d6d601bb3e5ba8172b1455f04710fd..83b8dff70eb8de7cf1d117585d47118fed539a15 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) { inline void save_float(float* data, const std::string& name, int len) { static int counter = 0; - std::string old_string = std::to_string(counter); + std::string old_string = paddle::lite::to_string(counter); std::string new_string = std::string(3 - old_string.length(), '0') + old_string; diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 988bc1bb507036de8f13a6c6549c549718bd1256..12a60bd27da832b338dc6b1ca11b1c7d6aa192e4 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -351,10 +351,10 @@ class Tensor { void printScale(std::string type) { printScale(); } std::string dimsFileName() { - return std::to_string(shape_->num()) + "_" + - std::to_string(shape_->channel()) + "_" + - std::to_string(shape_->height()) + "_" + - std::to_string(shape_->width()) + ".txt"; + return paddle::lite::to_string(shape_->num()) + "_" + + paddle::lite::to_string(shape_->channel()) + "_" + + paddle::lite::to_string(shape_->height()) + "_" + + paddle::lite::to_string(shape_->width()) + ".txt"; } void saveToFile() { std::string path = dimsFileName(); } @@ -374,7 +374,7 @@ class Tensor { invalidate(); std::ofstream ofs; static int counter = 0; - std::string npath = std::to_string(counter) + "_" + path; + std::string npath = paddle::lite::to_string(counter) + "_" + path; counter++; save_file_with_name(npath); } diff --git 
a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index d62ac9cad3e5ab4e6f63e3b667e3fa93e244fec1..345b239c320f04eba8426483a23a352e77a71036 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -19,8 +19,8 @@ namespace paddle { namespace lite { namespace npu { -std::unique_ptr Device::Build( - std::string& model_name, // NOLINT +std::shared_ptr Device::Build( + const std::string model_name, // NOLINT std::vector& input_nodes, // NOLINT std::vector& output_nodes // NOLINT ) { @@ -41,15 +41,15 @@ std::unique_ptr Device::Build( ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } + // Create a HiAI model manager client to load the HiAI om model - std::unique_ptr model_client( + std::shared_ptr model_client( new hiai::AiModelMngerClient()); if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } - model_name = "model_" + std::to_string(model_count_++) + ".om"; auto model_desc = std::make_shared( model_name, freq_level(), framework_type(), model_type(), device_type()); model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h index 411600ae0a38e4ee1b4a3ce3d6519b927eeb0a1a..6733a7f6dfa085d2c64274a81ba2a028ebe88f3f 100644 --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -40,8 +40,8 @@ class Device { // Build the HiAI IR graph to om model, return HiAI model manager client to // load om model and run inference. 
- std::unique_ptr Build( - std::string& model_name, // NOLINT + std::shared_ptr Build( + const std::string model_name, // NOLINT std::vector& input_nodes, // NOLINT std::vector& output_nodes // NOLINT ); // NOLINT @@ -51,7 +51,6 @@ class Device { int framework_type_{0}; int model_type_{0}; int device_type_{0}; - int model_count_{0}; }; } // namespace npu diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt index 3b504fbed6a5cef6ab3cff46c0e9b7009459ac80..0ac8cf310370f34ae5743113efe1d71579979daf 100644 --- a/lite/backends/opencl/CMakeLists.txt +++ b/lite/backends/opencl/CMakeLists.txt @@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper) -lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper) add_dependencies(cl_wrapper opencl_clhpp) diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl similarity index 99% rename from lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl rename to lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index d840195dd42c71bab5afda32a11d805f5a96b114..4b2d5ba32072e7eb31adbf347360e0bbcee7bc5b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -1,6 +1,6 @@ #include -__kernel void conv2d_1x1(__private const int global_size_dim0, +__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, __read_only image2d_t input_image, diff --git 
a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index 468dd1a8a30ca572d76ed0e20acf59e6906e0e1c..79f3922e89549fc15b7a849efb0e2b6595357102 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -14,21 +14,22 @@ limitations under the License. */ #include -__kernel void conv2d_3x3_opt(__private const int item_ch, +__kernel void conv2d_3x3_opt(__private const int item_ch, __private const int item_w, - __private const int item_h, + __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, #endif - __write_only image2d_t output_image, + __write_only image2d_t output_image, __private const int stride, - __private const int pad, + __private const int pad, __private const int dilation, - __private const int in_ch, + __private const int batch, + __private const int in_ch, __private const int in_w, - __private const int in_h, + __private const int in_h, __private const int out_w, __private const int out_h) { @@ -60,7 +61,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, #ifdef BIASE_CH CL_DTYPE4 output[5]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); output[1] = output[0]; output[2] = output[0]; output[3] = output[0]; @@ -69,23 +71,33 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, #elif defined(BIASE_ELE) CL_DTYPE4 output[5]; - output[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); if (out_w_id1 < out_w) { - output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, - (int2)(out_w_base_id + 
out_w_id1, item_h_id)); + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); } if (out_w_id2 < out_w) { - output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, - (int2)(out_w_base_id + out_w_id2, item_h_id)); + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); } if (out_w_id3 < out_w) { - output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, - (int2)(out_w_base_id + out_w_id3, item_h_id)); + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); } if (out_w_id4 < out_w) { - output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, - (int2)(out_w_base_id + out_w_id4, item_h_id)); + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); } #else CL_DTYPE4 output[5] = {0.0f}; @@ -108,54 +120,76 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, int filter_w_val = ch * 3; for (int h = 0; h < 3; h++) { - int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, + int in_h_val = select(out_batch_id * in_h + in_h_id + h, + -1, (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); for (int w = 0; w < 3; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, 
+ int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, sampler, + filter[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, sampler, + filter[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, sampler, + filter[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, sampler, + filter[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - filter_trans[0] = (CL_DTYPE4)(filter[0].x, filter[1].x, filter[2].x, - filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (CL_DTYPE4)(filter[0].y, filter[1].y, filter[2].y, - filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (CL_DTYPE4)(filter[0].z, filter[1].z, filter[2].z, - filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (CL_DTYPE4)(filter[0].w, filter[1].w, filter[2].w, - filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); - input[1] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); - input[2] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); - input[3] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); - input[4] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = 
(CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); output[0] = mad(input[0].x, filter_trans[0], output[0]); output[1] = mad(input[1].x, filter_trans[0], output[1]); @@ -194,23 +228,278 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, output[3] = activation_type4(output[3]); output[4] = activation_type4(output[4]); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); if (out_w_id1 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); } if (out_w_id2 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), - output[2]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); } if (out_w_id3 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), - output[3]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, 
item_h_id), + output[3]); } if (out_w_id4 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), - output[4]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); } } +// support batch > 1 +__kernel void conv2d_3x3_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif 
defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * 3; + int filter_h_val1 = filter_h_val0 + 3; + int filter_h_val2 = filter_h_val1 + 3; + int filter_h_val3 = filter_h_val2 + 3; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * 3; + + for (int h = 0; h < 3; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < 3; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, 
sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + 
out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..4ed2e072022dc4b457a86d634bf4bc21ab62bc45 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// opt version of conv5x5 +__kernel void conv2d_5x5_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + 
sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + 
filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = 
mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_5x5_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int 
in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = 
{0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + 
filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], 
output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..d82f4b4c96b586b6ecf948827402afd0766dcea4 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +// opt version of con7x7 +__kernel void conv2d_7x7_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + 
out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, 
input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + 
WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_7x7_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; 
+ int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + 
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + 
output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/x86/jit/gen/blas.h b/lite/backends/x86/jit/gen/blas.h index 39920195b245e1c44ff68ab91af94d25c949bd02..4317d558c6252e9163bc545cba4859fbcb89f804 100644 --- a/lite/backends/x86/jit/gen/blas.h +++ b/lite/backends/x86/jit/gen/blas.h @@ -17,6 +17,7 @@ #include #include "glog/logging.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -64,7 +65,7 @@ class VXXJitCode : public JitCode { base += "_Vec"; } base += (with_relu_ ? 
"_Relu" : ""); - base += "_D" + std::to_string(num_); + base += "_D" + paddle::lite::to_string(num_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h index 7cae76f9dd99cf904e831b196bd493623ff7eb1d..999960ece4170d561419ad24bd94c512ce167eb0 100644 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ b/lite/backends/x86/jit/gen/embseqpool.h @@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(tbl_w_)); + base += ("_W" + paddle::lite::to_string(tbl_w_)); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h index b1b302b7904a5d92952f4385c483eccdc5df3592..e7be6750cf0d232b41d3be61001eb0af4c52a129 100644 --- a/lite/backends/x86/jit/gen/matmul.h +++ b/lite/backends/x86/jit/gen/matmul.h @@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode { std::string name() const override { std::string base = "MatMulJitCode"; - base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + - std::to_string(k_); + base = base + "_M" + paddle::lite::to_string(m_) + "_N" + + paddle::lite::to_string(n_) + "_K" + paddle::lite::to_string(k_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h index 346179cfbbd0e8291dc17b266366c5df07114b7f..60e27993057b58eb8a4a07fcd0a368fc0a9441fc 100644 --- a/lite/backends/x86/jit/gen/seqpool.h +++ b/lite/backends/x86/jit/gen/seqpool.h @@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(w_)); + base += ("_W" + paddle::lite::to_string(w_)); return base; } void genCode() override; diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 
e3ee1056d6c0816463bb21e95ea38101fd5d27ba..35aad501070282b49cdd8df72185ad9d21dab9fe 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -94,9 +94,13 @@ add_custom_command( OUTPUT ops.h # not a real path to the output to force it execute every time. ) # generate fake kernels for memory_optimize_tool + +#-------------------------------opt---------------------------------------------------------------- +# tricks to create headfiles for opt add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py ${kernels_src_list} + ${fake_kernels_src_list} ${CMAKE_BINARY_DIR}/all_kernel_faked.cc ${CMAKE_BINARY_DIR}/kernel_src_map.h OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time. @@ -104,12 +108,12 @@ add_custom_command( add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) -#add_custom_target(opencl_kernels_source_cc DEPENDS opencl_kernels_source.cc) # create headfile to restore ops info sorted by suppported platforms add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py ${kernels_src_list} + ${fake_kernels_src_list} ${ops_src_list} ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. 
diff --git a/lite/core/context.h b/lite/core/context.h index 5f711a51434e90d27ca206724bc5b37593e6f70e..cdab4e473bf44c1b5b4ec6c0715ce44074ac63cf 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -490,7 +490,7 @@ class ContextScheduler { } break; #endif default: -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL +#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) LOG(FATAL) << "unsupported target " << TargetToStr(target); #endif break; diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index d3e7a625a7a768936db178b3c325e6ac84a0057e..28ec814fa85451b5292bfde6bddc6b64b57b2f08 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -48,13 +48,16 @@ std::string Visualize(mir::SSAGraph* graph) { auto attr_type = op_info->GetAttrType(attr_name); switch (attr_type) { case AttrType::INT: - os << ":int:" << std::to_string(op_info->GetAttr(attr_name)); + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); break; case AttrType::FLOAT: - os << ":float:" << std::to_string(op_info->GetAttr(attr_name)); + os << ":float:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); break; case AttrType::BOOLEAN: - os << ":int:" << std::to_string(op_info->GetAttr(attr_name)); + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); break; case AttrType::STRING: os << ":string: \"" diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index ee78fac9a88aa339514778dcc03e2c907487fb39..38293ede76ed35bf05767ce1333947b7dfdbc4ac 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -123,7 +123,8 @@ void MemoryOptimizePass::CollectLifeCycleByDevice( // non-tensor(like tensor_array) variables will not be reused for (auto& node : graph->nodes()) { - if (node.IsArg() && !node.arg()->type->IsTensor()) { + if (node.IsArg() && (node.arg()->type != nullptr) && + 
!node.arg()->type->IsTensor()) { invalid_var_names.insert(node.arg()->name); } } @@ -237,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); input_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; node_append_idx++; } } @@ -261,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); out_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; node_append_idx++; } } diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h index e7c44d2be689a9d890158c097e198314413d1ba3..45b15812fadb0789edea3f89fb00b4612bdb010f 100644 --- a/lite/core/mir/node.h +++ b/lite/core/mir/node.h @@ -85,7 +85,7 @@ class Node { struct Arg { std::string name; int id{0}; - const Type* type{}; + const Type* type{nullptr}; // Weight is a special kind of argument, it is marked as weight explicitly // so that some weight related optimization can take place. 
bool is_weight{false}; diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 54a4e779c6b6d0150cad966a4454f30624fe6dae..40cad8f6af75300ab85753b16e391daeeadc6c2f 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -58,6 +58,11 @@ void QuantizedOpAttributesInferencePass::Apply( } if (found) { inst.mutable_op_info()->SetAttr("output_scale", output_scale); + } else if (op_info->HasAttr("output_scale")) { + int bit_length = op_info->GetAttr("bit_length"); + int range = (1 << (bit_length - 1)) - 1; + output_scale = op_info->GetAttr("output_scale"); + inst.mutable_op_info()->SetAttr("output_scale", output_scale / range); } if (op_info->HasAttr("output_scale")) { inst.mutable_op_info()->SetAttr("enable_int8", true); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 6628444338830a35cb4ca78334398b0d4378bf3b..6d45be3b898271f0801d289d16235d3fb5fdd706 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass { } VLOG(4) << "[score(final)]:" << final_score; - VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + VLOG(2) << "-------- pick summary for " << instruct.op_type() + << " --------"; + VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); - VLOG(4) << " ===> kernel.place():" + VLOG(2) << " ===> kernel.place():" << PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 
943adc96b4be66eea1da6c71c189702834ccd295..91aa04d99505eac5fa9abc50a5008ec7b5de4fbf 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -66,11 +66,11 @@ std::string SubgraphVisualizer::operator()() { } else { exists_ops[op_type]++; } - auto op_name = op_type + std::to_string(exists_ops[op_type]); + auto op_name = op_type + paddle::lite::to_string(exists_ops[op_type]); std::string op_color = "white"; if (subgraph_indices.count(node)) { auto subgraph_idx = subgraph_indices[node]; - op_name += "_subgraph_" + std::to_string(subgraph_idx); + op_name += "_subgraph_" + paddle::lite::to_string(subgraph_idx); op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()]; } dot.AddNode(op_name, @@ -223,6 +223,7 @@ std::unordered_set SubgraphDetector::GetExcludedNodesFromConfigFile() { std::vector lines = ReadLines(config_file_path); for (std::string line : lines) { + if (line.empty()) continue; std::vector node_info = Split(line, ":"); std::string op_type = node_info.at(0); std::vector in_vars_name; @@ -413,7 +414,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, cpp::OpDesc subgraph_op_desc; subgraph_op_desc.SetType("subgraph"); - // Create a new sub block desc for storing all of Ops an Vars of the target + // Create a new sub block desc for storing all of Ops and Vars of the target // subgraph and sub_block_idx is set as a attribute of subgraph op, // sub_block_idx < 0 means it's a new subgraph op int sub_block_idx = -(subgraph_idx + 1); diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index e96a080d574fbdf4dbf05d79c28e64b2148a98e2..974772a9839c1e089359be3ae98e1833645ccd7a 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -39,7 +39,7 @@ std::vector AddFCDesc( CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(wshape.size(), 2); static int id = 0; - std::string prefix = "fc_" + 
std::to_string(id); + std::string prefix = "fc_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* wgt = block_desc->AddVar(); @@ -76,7 +76,7 @@ std::vector AddElementwiseAddDesc( const std::vector& input_Y_names) { // CHECK_EQ(input_var_names.size(), 2); static int id = 0; - std::string prefix = "elementwise_add_" + std::to_string(id); + std::string prefix = "elementwise_add_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -100,7 +100,7 @@ std::vector AddFeedDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "feed_" + std::to_string(id); + std::string prefix = "feed_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -123,7 +123,7 @@ std::vector AddFetchDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "fetch_" + std::to_string(id); + std::string prefix = "fetch_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 7cf65a23b8c5646c8ff6c77917dde53b7f036b9c..7117e1b3399fe823194f7f1a4d4c239099580955 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -17,6 +17,7 @@ #include "lite/api/paddle_api.h" #include "lite/api/test_helper.h" #include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" DEFINE_string(model_file, "", "model file path of combined protobuf model"); DEFINE_string(params_file, "", "params file path of combined protobuf model"); @@ -31,43 +32,17 @@ namespace lite { // The helper functions for loading and running model from command line and // verifying output data std::vector TypeParsing(std::string text) { - std::vector types; - while (!text.empty()) 
{ - size_t index = text.find_first_of(":"); - std::string type = text.substr(0, index); - VLOG(3) << type; - types.push_back(type); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); - } - } - return types; + return Split(text, ":"); } std::vector> ShapeParsing(std::string text) { std::vector> shapes; - while (!text.empty()) { - size_t index = text.find_first_of(":"); - std::string slice = text.substr(0, index); - std::vector shape; - while (!slice.empty()) { - size_t index = slice.find_first_of(","); - int d = atoi(slice.substr(0, index).c_str()); - VLOG(3) << d; - shape.push_back(d); - if (index == std::string::npos) { - break; - } else { - slice = slice.substr(index + 1); - } - } - shapes.push_back(shape); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); + std::vector shape_strings = Split(text, ":"); + shapes.resize(shape_strings.size()); + for (int i = 0; i < shape_strings.size(); i++) { + std::vector shape_nums = Split(shape_strings[i], ","); + for (auto shape_num : shape_nums) { + shapes[i].push_back(atoi(shape_num.c_str())); } } return shapes; diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index f517a041200c32f04406bbcd877ed8484488e663..1133e5ba8203ec9fea177844a6311c993f6b8ff7 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -41,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; - VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc - << " inlinks.size():" << inlinks.size(); + VLOG(4) << "============== node->AsStmt().op_type():" + << node->AsStmt().op_type() << " inlinks.size():" << inlinks.size() + << " ================"; for (auto* in : inlinks) { ComplementInputs(graph.get(), node, in); } @@ -68,13 
+69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name)); auto decl_arg_type = inst.picked_kernel().GetInputDeclType(inst_in_tensor_name); + CHECK(in->AsArg().type); - VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name + VLOG(3) << "\n inst_in_tensor_name:" << inst_in_tensor_name << "\n in->AsArg().name:" << in->AsArg().name << "\n *in->AsArg().type:" << *in->AsArg().type << "\n *decl_arg_type:" << *decl_arg_type << "\n inst.op()->DebugString():" << inst.op()->DebugString(); + // TODO(ysh329): conflict if tensor with kARM target but kImageDefault(OpenCL + // layout). + // not a good judge, but don't find the source of this issue from + // static_pick_kernel_pass + // to this pass. + auto* in_arg_type = const_cast(in->AsArg().type); + if (in_arg_type->target() == TARGET(kARM) && + in_arg_type->layout() == DATALAYOUT(kImageDefault)) { + return; + } + if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name << " for kernel " << inst.op()->DebugString() << " " diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 25b367e73c4e27132b41cb2b5ec83b64a4bf226d..ecccf89fa76287a3f30756f7138fcce229e8f337 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -201,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from, CHECK(in->IsArg()); // auto node_id = [&] { return graph->nodes().size(); }; auto cast_op_output_name = in->AsArg().name + "/precision_trans"; - // in->AsArg().name + "/precision_trans/" + std::to_string(node_id()); + // in->AsArg().name + "/precision_trans/" + + // paddle::lite::to_string(node_id()); auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); cast_op_output_arg->AsArg().type = LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); diff --git 
a/lite/core/op_lite.h b/lite/core/op_lite.h index 5dec9ed7aace837e3eb085a55d7b9b5382f7dea3..77d8091b4b16cfbce2efc3d549f916a9136c61ab 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -65,6 +65,7 @@ class OpLite : public Registry { virtual bool CheckShape() const { return true; } // Inference the outputs' shape. virtual bool InferShape() const { return true; } + virtual bool SmartInferShape() { return this->InferShape(); } // Run this operator. virtual bool Run(); // Indicate whether the Op runs only once or not @@ -150,6 +151,10 @@ class OpLite : public Registry { std::vector valid_places_; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; std::unique_ptr op_info_; + std::vector last_input_shapes; + std::vector last_output_shapes; + std::vector>> last_output_lods; + std::vector>> last_input_lods; }; /* diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index d9111e5c46c9217b181e5a3e5a8c7981f46250df..39213a33cebd05d9cfa50d82cdfb09ad3f7ad637 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -22,18 +22,25 @@ #include #include "lite/core/program.h" +#ifdef LITE_WITH_OPENCL +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/kernels/opencl/image_helper.h" +#endif + namespace paddle { namespace lite { namespace profile { template -static void write_tensorfile(const Tensor* tensor, const std::string& locate) { +static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { if (locate.find('/') != std::string::npos) { - return; + return false; } FILE* fp = fopen(locate.c_str(), "w"); if (fp == nullptr) { LOG(ERROR) << "file open field " << locate; + return false; } else { const dtype* data = tensor->data(); for (int i = 0; i < tensor->numel(); ++i) { @@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) { } } fclose(fp); + return true; } 
class PrecisionProfiler { public: - explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} - ~PrecisionProfiler() { - LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() - << " on Target " << TargetToStr(inst_->kernel()->target()) << " " - << PrecisionToStr(inst_->kernel()->precision()); - auto tensor_mean = [](const Tensor* in, - PrecisionType ptype, - std::string name = "inst") -> double { - if (!in->data()) { - return -99999; - } - double sum = 0.; - switch (ptype) { + // TODO(ysh329): need to remove `explicit PrecisionProfiler` + // keep this method only for arm/math/conditional + explicit PrecisionProfiler(const Instruction* inst) { + std::string inst_precison_str = GetInstPrecision(inst); + } + + PrecisionProfiler() {} + + std::string GetSummaryHeader() { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + ss << "========================================= " + << "Detailed Precision Profiler Summary " + << "=========================================" << std::endl; + ss << setw(45) << left << "operator:(kernel_info)" + << " " << setw(70) << left << "output_tensor_name:(tensor_info)" + << " " << setw(15) << left << "dims" + << " " << setw(15) << left << "mean" + << " " << setw(15) << left << "std_deviation" + << " " << setw(15) << left << "ave_grow_rate*" << std::endl; + + return ss.str(); + } + + template + double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; + } + + template + double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); + } + + template + double compute_average_grow_rate(const T* in, const size_t length) { + const 
double eps = 1e-5; + double ave_grow_rate = 0.0f; + for (size_t i = 1; i < length; ++i) { + ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps); + } + ave_grow_rate /= length; + return ave_grow_rate; + } + + // check if output tensor unused + bool is_unused(const Tensor* in) { + if (!in->data()) { + return true; + } + return false; + } + + void compute_tensor_precision_info(const Tensor* in, + TargetType target_type, + PrecisionType precision_type, + DataLayoutType layout_type, + double* mean, + double* std_dev, + double* ave_grow_rate, + std::string name = "inst", + bool write_result_to_file = false) { + std::string unsupported_error_log = + "Unsupported precision profile for kernel registered on" + + TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" + + DataLayoutToStr(layout_type); + + if (target_type == TARGET(kARM) || target_type == TARGET(kHost) || + target_type == TARGET(kX86)) { + switch (precision_type) { case PRECISION(kFloat): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kAny): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt8): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, 
in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt32): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } default: - LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); - return 0.; + *mean = -333333333333; + *std_dev = -33333333333; + *ave_grow_rate = -33333333333; + LOG(ERROR) << unsupported_error_log; + return; } - }; - if (inst_->op()->op_info()->Type() != "fetch") { - auto op = const_cast(inst_->op()); - auto kernel = inst_->kernel(); +#ifdef LITE_WITH_OPENCL + } else if (target_type == TARGET(kOpenCL)) { + switch (layout_type) { + case DATALAYOUT(kImageDefault): { + paddle::lite::CLImageConverterDefault default_convertor; + auto image_shape = default_convertor.InitImageDimInfoWith(in->dims()); + size_t im_w = image_shape[0]; + size_t im_h = image_shape[1]; + VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " " + << im_h; + std::vector in_data_v(im_w * im_h * 4); + std::vector real_out_v(in->numel()); + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + TargetWrapperCL::ImgcpySync(in_data_v.data(), + in->data(), + im_w, + im_h, + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + default_convertor.ImageToNCHW( + in_data_v.data(), real_out_v.data(), image_shape, in->dims()); + CHECK(real_out_v.size() == in->numel()); + *mean = compute_mean(real_out_v.data(), real_out_v.size()); + *std_dev = compute_standard_deviation( + real_out_v.data(), in->numel(), true, *mean); + 
*ave_grow_rate = compute_average_grow_rate(real_out_v.data(), + real_out_v.size()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + case DATALAYOUT(kNCHW): { + std::vector in_data_v(in->numel(), 0); + TargetWrapperCL::MemcpySync(in_data_v.data(), + in->data(), + in->numel() * sizeof(float), + IoDirection::DtoH); + VLOG(1) << name << ":" << in->numel(); + *mean = compute_mean(in_data_v.data(), in->numel()); + *std_dev = compute_standard_deviation( + in_data_v.data(), in->numel(), true, *mean); + *ave_grow_rate = + compute_average_grow_rate(in_data_v.data(), in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + default: + *mean = -222222222222; + *std_dev = -22222222222; + *ave_grow_rate = -22222222222; + LOG(ERROR) << unsupported_error_log; + return; + } +#endif + } else { + *mean = -111111111111; + *std_dev = -11111111111; + *ave_grow_rate = -11111111111; + LOG(ERROR) << unsupported_error_log; + return; + } + } + + std::string GetInstPrecision(const Instruction* inst = nullptr) { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + bool write_result_to_file = false; + + VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr() + << " registered on " << TargetToStr(inst->kernel()->target()) << "/" + << PrecisionToStr(inst->kernel()->precision()) << "/" + << DataLayoutToStr(inst->kernel()->layout()); + + std::string kernel_repr = inst->op()->op_info()->Repr(); + std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" + + PrecisionToStr(inst->kernel()->precision()) + + "/" + DataLayoutToStr(inst->kernel()->layout()); + std::string op_name = inst->op()->op_info()->Type(); + + if (inst->op()->op_info()->Type() != "fetch") { + auto op = const_cast(inst->op()); + auto kernel = inst->kernel(); auto op_scope = op->scope(); auto out_names = op->op_info()->output_names(); for (auto& out_name : out_names) { @@ -106,32 +277,90 @@ class PrecisionProfiler { auto type = 
kernel->GetOutputDeclType(out_arg_name); if (type->IsTensor()) { - auto tout = op_scope->FindVar(out_name)->GetMutable(); - double mean = tensor_mean(tout, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean << " shape:" << tout->dims(); + const Tensor* tout = + op_scope->FindVar(out_name)->GetMutable(); + double mean = -999999; + double std_dev = -100000; + double ave_grow_rate = 99999; + std::string mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } else if (type->IsTensorList()) { - auto tout = + auto touts = op_scope->FindVar(out_name)->GetMutable>(); - for (auto& t : *tout) { - double mean = tensor_mean(&t, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean; + for (auto t : *touts) { + const Tensor* tout = &t; + double mean = -999999; + double std_dev = -100000; + double ave_grow_rate = 99999; + std::string 
mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } } } } + return ss.str(); } - - private: - const Instruction* inst_{nullptr}; }; } // namespace profile } // namespace lite } // namespace paddle +// TODO(ysh329): need to remove. 
+// keep this method only for arm/math/conditional_block_compute #define LITE_PRECISION_PROFILE(inst) \ { auto a = paddle::lite::profile::PrecisionProfiler(&inst); } diff --git a/lite/core/program.cc b/lite/core/program.cc index 0895643a6adde0095f9d2892c41f263eedd4284f..580389fbad54c0de8efd65ef78c9b69fd3e72893 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { } void RuntimeProgram::Run() { +#ifdef LITE_WITH_PROFILE +#ifdef LITE_WITH_PRECISION_PROFILE + auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); + std::string precision_profiler_summary = + inst_precision_profiler.GetSummaryHeader(); +#endif +#endif + for (auto& inst : instructions_) { #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; @@ -144,13 +152,17 @@ void RuntimeProgram::Run() { #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE #ifndef LITE_WITH_FPGA - LITE_PRECISION_PROFILE(inst) + precision_profiler_summary += + inst_precision_profiler.GetInstPrecision(&inst); #endif #endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); +#ifdef LITE_WITH_PRECISION_PROFILE + LOG(INFO) << "\n" << precision_profiler_summary; +#endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } @@ -274,7 +286,8 @@ void Instruction::Run() { return; } - op_->InferShape(); + // op_->InferShape(); + op_->SmartInferShape(); kernel_->Launch(); has_run_ = true; } diff --git a/lite/core/program_fake_utils.h b/lite/core/program_fake_utils.h index edcbb101aa5ddb090cc585a16597967cb5114936..fbee253872237bce08f3f67b948da79becbae21a 100644 --- a/lite/core/program_fake_utils.h +++ b/lite/core/program_fake_utils.h @@ -30,9 +30,9 @@ Program FakeProgram() { auto add_fc = [&](int id, std::string x) { // create variables - std::string w1 = "w" + std::to_string(id); - std::string b1 = "b" 
+ std::to_string(id); - std::string out1 = "out" + std::to_string(id); + std::string w1 = "w" + paddle::lite::to_string(id); + std::string b1 = "b" + paddle::lite::to_string(id); + std::string out1 = "out" + paddle::lite::to_string(id); auto w1v = program.scope()->Var(w1)->GetMutable(); auto b1v = program.scope()->Var(b1)->GetMutable(); auto out1v = program.scope()->Var(out1)->GetMutable(); diff --git a/lite/core/version.h.in b/lite/core/version.h.in index d34c32073b852a50b5d26984ed4812ac4f38a870..da2d5f3ed99631973d97a94741e1711391237261 100644 --- a/lite/core/version.h.in +++ b/lite/core/version.h.in @@ -53,9 +53,9 @@ static std::string version() { static int64_t int_version(const std::string& version) { const std::vector vec = Split(version, "."); if (vec.size() == 3) { - return std::stoi(vec[0]) * MAJOR_COEFF + - std::stoi(vec[1]) * MINOR_COEFF + - std::stoi(vec[2]) * PATCH_COEFF; + return atoi(vec[0].c_str()) * MAJOR_COEFF + + atoi(vec[1].c_str()) * MINOR_COEFF + + atoi(vec[2].c_str()) * PATCH_COEFF; } return -1; } diff --git a/lite/demo/cxx/mask_detection/mask_detection.cc b/lite/demo/cxx/mask_detection/mask_detection.cc index 67014aef9d1797312bffc05712b57357c4d8204c..09a9c0ee158e7d5913a78877711d831fc5738cf1 100644 --- a/lite/demo/cxx/mask_detection/mask_detection.cc +++ b/lite/demo/cxx/mask_detection/mask_detection.cc @@ -207,7 +207,8 @@ void RunModel(std::string det_model_file, cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h); // uncomment two lines below, save roi img to disk - // std::string roi_name = "roi_" + std::to_string(i) + ".jpg"; + // std::string roi_name = "roi_" + paddle::lite::to_string(i) + // + ".jpg"; // imwrite(roi_name, roi); // Do PreProcess diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index 3eaf63e7f9be80cf36c475476c644516bbc75fbd..150bcd231c27c25d8510fc8dfa3281a8351514dd 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ 
b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,32 @@ std::string ShapePrint(const shape_t& shape) { return shape_str; } +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + inline double GetCurrentUS() { struct timeval time; gettimeofday(&time, NULL); @@ -101,24 +128,24 @@ void RunModel(std::string model_dir, // 5. Get output std::cout << "\n====== output summary ====== " << std::endl; size_t output_tensor_num = predictor->GetOutputNames().size(); - std::cout << "output tesnor num:" << output_tensor_num << std::endl; + std::cout << "output tensor num:" << output_tensor_num << std::endl; for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { std::unique_ptr output_tensor = predictor->GetOutput(tidx); std::cout << "\n--- output tensor " << tidx << " ---" << std::endl; auto out_shape = output_tensor->shape(); - std::cout << "out_shape(NCHW):" << ShapePrint(out_shape) << std::endl; + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, ShapeProduction(out_shape)); + auto out_std_dev = compute_standard_deviation( + out_data, ShapeProduction(out_shape), true, out_mean); - float sum = 0.f; - for (int i = 0; i < ShapeProduction(out_shape); ++i) { - sum += output_tensor->data()[i]; - } + std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl; std::cout << "output tensor " << tidx << " elem num:" << ShapeProduction(out_shape) << std::endl; - std::cout << "output tensor " << 
tidx << " sum value:" << sum << std::endl; std::cout << "output tensor " << tidx - << " mean value:" << sum / ShapeProduction(out_shape) + << " standard deviation:" << out_std_dev << std::endl; + std::cout << "output tensor " << tidx << " mean value:" << out_mean << std::endl; // print output diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc index 0d8f4d0d192f3563d00bb66778ca4e13a17b93b1..6c43f6e0116d9adfc4fc6f315d5653b2634dfe7b 100644 --- a/lite/gen_code/gen_code.cc +++ b/lite/gen_code/gen_code.cc @@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id, switch (type) { case AttrType::INT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::FLOAT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::BOOLEAN: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::STRING: return "\"" + desc.GetAttr(name) + "\""; case AttrType::FLOATS: { diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h index 58a7959f4eb34cb438bf0e25b49b36110435cc6b..d316eac43f99664fa71cba54b3ab5360852300a0 100644 --- a/lite/gen_code/gen_code.h +++ b/lite/gen_code/gen_code.h @@ -153,16 +153,16 @@ class Module { private: std::string WeightUniqueName() const { - return "w_" + std::to_string(weight_counter_++); + return "w_" + paddle::lite::to_string(weight_counter_++); } std::string TmpVarUniqueName() const { - return "tmp_" + std::to_string(tmp_var_counter_++); + return "tmp_" + paddle::lite::to_string(tmp_var_counter_++); } std::string OpUniqueName() const { - return "op_" + std::to_string(op_counter_++); + return "op_" + paddle::lite::to_string(op_counter_++); } std::string KernelUniqueName() const { - return "kernel_" + std::to_string(kernel_counter_++); + return "kernel_" + paddle::lite::to_string(kernel_counter_++); } std::string DataRepr(const std::string &raw_data, 
PrecisionType dtype); diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 514d6069b5db9f1cf0fdd5d8a87a7cf89411dd23..7550d770145d92ebd343f96a82c6f34d72c91ea5 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,6 +1,6 @@ # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # to the model_optimize_tool. -if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) return() endif() @@ -109,6 +109,8 @@ add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_de if(LITE_WITH_TRAIN) add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) endif() diff --git a/lite/kernels/arm/elementwise_grad_compute.cc b/lite/kernels/arm/elementwise_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..93bc5853459005137ef4f948f3a5892d76441b7c --- /dev/null +++ b/lite/kernels/arm/elementwise_grad_compute.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/elementwise_grad_compute.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +inline DDim trim_trailing_singular_dims(const DDim& dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + +inline bool is_broadcast(const DDim& x_dims, + const DDim& y_dims, + int axis, + int* pre, + int* n, + int* post) { + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + DDim y_dim_trim = trim_trailing_singular_dims(y_dims); + axis = (y_dim_trim.size() == 0) ? 
x_dims.size() : axis; + if (x_dims.size() == y_dim_trim.size()) { + return false; + } + *pre = 1; + *n = 1; + *post = 1; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dim_trim.size(); ++i) { + CHECK_EQ(x_dims[i + axis], y_dim_trim[i]) + << "Broadcast dimension mismatch."; + (*n) *= y_dim_trim[i]; + } + for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + return true; +} + +void ElementwiseAddGradCompute::Run() { + auto& param = Param(); + const float* x_data = param.X->data(); + const float* y_data = param.Y->data(); + const float* out_grad_data = param.OutGrad->data(); + float* x_grad_data; + float* y_grad_data; + if (param.XGrad) { + x_grad_data = param.XGrad->mutable_data(); + } + if (param.YGrad) { + y_grad_data = param.YGrad->mutable_data(); + } + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (!param.XGrad) { + CHECK(param.YGrad); + lite::arm::math::elementwise_add_grad( + out_grad_data, y_grad_data, y_dims.production()); + return; + } + + if (!param.YGrad) { + CHECK(param.XGrad); + lite::arm::math::elementwise_add_grad( + out_grad_data, x_grad_data, x_dims.production()); + return; + } + + if (x_dims.size() < y_dims.size() && + is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_add_grad_broadcast( + out_grad_data, y_grad_data, x_grad_data, pre, n, post); + } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_add_grad_broadcast( + out_grad_data, x_grad_data, y_grad_data, pre, n, post); + } else { + lite::arm::math::elementwise_add_grad( + out_grad_data, x_grad_data, x_dims.production()); + lite::arm::math::elementwise_add_grad( + out_grad_data, y_grad_data, y_dims.production()); + } +} + +void ElementwiseSubGradCompute::Run() { + auto& param = Param(); + const float* x_data = param.X->data(); + const float* y_data = param.Y->data(); + 
const float* out_data = param.OutGrad->data(); + float* x_grad_data; + float* y_grad_data; + if (param.XGrad) { + x_grad_data = param.XGrad->mutable_data(); + } + if (param.YGrad) { + y_grad_data = param.YGrad->mutable_data(); + } + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + + if (!param.XGrad || !param.YGrad) { + CHECK(param.XGrad || param.YGrad); + lite::arm::math::elementwise_sub_grad( + out_data, x_grad_data, y_grad_data, y_dims.production()); + return; + } + + if (x_dims.size() < y_dims.size()) { + LOG(FATAL) << "elewise sub grad don't support x_dims size < y_dims size"; + } + if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_sub_grad_broadcast( + out_data, x_grad_data, y_grad_data, pre, n, post); + } else { + lite::arm::math::elementwise_sub_grad( + out_data, x_grad_data, y_grad_data, x_dims.production()); + } +} + +template +void ElementwiseMulGradCompute::Run() { + LOG(FATAL) << "elementwise mul_grad not implement yet"; +} + +void ElementwiseMaxGradCompute::Run() { + LOG(FATAL) << "elementwise max_grad not implement yet"; +} + +void ElementwiseDivGradCompute::Run() { + LOG(FATAL) << "elementwise div_grad not implement yet"; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +using elementwise_mul_grad_float = + paddle::lite::kernels::arm::ElementwiseMulGradCompute; + +REGISTER_LITE_KERNEL(elementwise_add_grad, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ElementwiseAddGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_sub_grad, + kARM, + kFloat, + kNCHW, + 
paddle::lite::kernels::arm::ElementwiseSubGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_div_grad, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ElementwiseDivGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + elementwise_mul_grad, kARM, kFloat, kNCHW, elementwise_mul_grad_float, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_max_grad, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ElementwiseMaxGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/elementwise_grad_compute.h b/lite/kernels/arm/elementwise_grad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..1273d8317410ce6689637e28597f9867702e1c2c --- /dev/null +++ b/lite/kernels/arm/elementwise_grad_compute.h @@ -0,0 
+1,68 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class ElementwiseAddGradCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseAddGradCompute() = default; +}; + +class ElementwiseSubGradCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseSubGradCompute() = default; +}; + +template +class ElementwiseMulGradCompute : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMulGradCompute() = default; +}; + +class ElementwiseMaxGradCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMaxGradCompute() = default; +}; + +class ElementwiseDivGradCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseDivGradCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/mul_grad_compute.cc b/lite/kernels/arm/mul_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..405d61d2ac3e4e060234eac63173e5bdd898d2ae --- /dev/null +++ b/lite/kernels/arm/mul_grad_compute.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/kernels/arm/mul_grad_compute.h"
+#include <vector>
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/arm/math/sgemm.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+void MulGradCompute::PrepareForRun() {
+  auto& ctx = this->ctx_->template As<ARMContext>();
+}
+
+void MulGradCompute::Run() {
+  // step1 flatten_2d
+  auto& param = Param<param_t>();
+  const auto x_dims = param.x->dims();
+  const auto y_dims = param.y->dims();
+  const auto out_dims = param.output_grad->dims();
+
+  m_ = static_cast<int>(x_dims.Slice(0, param.x_num_col_dims).production());
+
+  k_ = static_cast<int>(
+      x_dims.Slice(param.x_num_col_dims, x_dims.size()).production());
+  n_ = static_cast<int>(
+      y_dims.Slice(param.y_num_col_dims, y_dims.size()).production());
+
+  const auto* out_grad_data = param.output_grad->data<float>();
+  const auto* x_data = param.x->data<float>();
+  const auto* y_data = param.y->data<float>();
+  float* x_grad_data;
+  float* y_grad_data;
+  if (param.x_grad) {
+    x_grad_data = param.x_grad->mutable_data<float>();
+  }
+
+  if (param.y_grad) {
+    y_grad_data = param.y_grad->mutable_data<float>();
+  }
+
+  paddle::lite::operators::ActivationParam act_param;
+  act_param.has_active = false;
+  // out_grad * y^T = x_grad
+  // (m, n), (n, k) -> (m, k)
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  if (param.x_grad) {
+    if (m_ == 1) {
+      lite::arm::math::sgemv(y_data,
+                             out_grad_data,
+                             x_grad_data,
+                             false,
+                             k_,  // M
+                             n_,  // N
+                             false,
+                             nullptr,
+                             false,
+                             lite_api::ActivationType::kIndentity,
+                             &ctx);
+    } else {
+      paddle::lite::arm::math::sgemm(false,
+                                     true,           // is_transB,
+                                     m_,             // M
+                                     k_,             // N
+                                     n_,             // K
+                                     1.0f,           // alpha
+                                     out_grad_data,  // A
+                                     n_,             // lda
+                                     y_data,         // B
+                                     n_,             // ldb
+                                     0.f,            // beta
+                                     x_grad_data,    // C
+                                     k_,             // ldc
+                                     NULL,           // bias
+                                     false,          // is_bias
+                                     act_param,      // act_param
+                                     &ctx);          // ctx
+    }
+  }
+
+  // x^T * out_grad = y_grad
+  // (k, m) (m, n) -> (k, n)
+  if (param.y_grad) {
+    if (n_ == 1) {
+      lite::arm::math::sgemv(x_data,
+                             out_grad_data,
+                             y_grad_data,
+                             true,
+                             k_,  // M
+                             m_,  // N
+                             false,
+                             nullptr,
+                             false,
+                             lite_api::ActivationType::kIndentity,
+                             &ctx);
+    } else {
+      paddle::lite::arm::math::sgemm(true,           // is_transA
+                                     false,          // is_transB,
+                                     k_,             // M
+                                     n_,             // N
+                                     m_,             // K
+                                     1.0f,           // alpha
+                                     x_data,         // A
+                                     k_,             // lda
+                                     out_grad_data,  // B
+                                     n_,             // ldb
+                                     0.f,            // beta
+                                     y_grad_data,    // C
+                                     n_,             // ldc
+                                     NULL,           // bias
+                                     false,          // is_bias
+                                     act_param,      // act_param
+                                     &ctx);          // ctx
+    }
+  }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(mul_grad,
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::MulGradCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
diff --git a/lite/kernels/arm/mul_grad_compute.h b/lite/kernels/arm/mul_grad_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cdaff3f10ce0a3c0a9509765f858c7371a75f0c
--- /dev/null
+++ b/lite/kernels/arm/mul_grad_compute.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/types.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+class MulGradCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MulGradParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  virtual ~MulGradCompute() = default;
+
+ private:
+  int m_, n_, k_;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/bm/bridges/utility.cc b/lite/kernels/bm/bridges/utility.cc
index aa61462d046e1d21b49517a6362b54a884a6b6de..ffbefa137b9c9caab388fcee865469cea87b83e4 100644
--- a/lite/kernels/bm/bridges/utility.cc
+++ b/lite/kernels/bm/bridges/utility.cc
@@ -33,7 +33,7 @@ std::string UniqueName(const std::string& prefix) {
     counter = ++(it->second);
   }
 
-  return prefix + "_" + std::to_string(counter);
+  return prefix + "_" + paddle::lite::to_string(counter);
 }
 
 bool HasInputArg(const OpInfo* op_info,
diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt
index 9ec335ce81bff6e69fbc5b12914110a445f0afb6..3fb3136bfc0787f9d8e539039811d25559919f4e 100644
--- a/lite/kernels/cuda/CMakeLists.txt
+++ b/lite/kernels/cuda/CMakeLists.txt
@@ -1,4 +1,4 @@
-if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_CUDA))
+if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_CUDA))
     return()
 endif()
 
diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt
index f6c3a399490a86e2ac2fcd9cbeb76fca8c8ac479..1f9b84e7db0b98ce45e620cb1840842ba397953e 100755
--- a/lite/kernels/fpga/CMakeLists.txt
+++ b/lite/kernels/fpga/CMakeLists.txt
@@ -1,4 +1,4 @@
-if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_FPGA))
+if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_FPGA))
     return()
 endif()
 
diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h
index cc4a7e2a7ce062090ca890d90e21aa643e37a0d3..67d8a2b1cc708f7530532840df3e71770b5a3695 100644 --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -87,7 +87,8 @@ class Graph { auto idx = Add(name, node); CHECK_GE(idx, 1); // Generate a unique name for the created HiAI IR - node->set_data(std::make_shared(name + "__" + std::to_string(idx))); + node->set_data( + std::make_shared(name + "__" + paddle::lite::to_string(idx))); return node; } diff --git a/lite/kernels/npu/bridges/split_op.cc b/lite/kernels/npu/bridges/split_op.cc index 44786220d7dd7fa24e012073e63935d6c824eb98..ef2bdb68fa9988b6a1985a34d22320193256de7b 100644 --- a/lite/kernels/npu/bridges/split_op.cc +++ b/lite/kernels/npu/bridges/split_op.cc @@ -64,10 +64,12 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { split_op->create_dynamic_output_y(out_names.size()); int idx = 1; for (auto& out_name : out_names) { - auto zero_node = graph->Add(out_name + "/zero" + std::to_string(idx), 0); + auto zero_node = + graph->Add(out_name + "/zero" + paddle::lite::to_string(idx), 0); auto add_node = graph->Add(out_name); auto add_op = add_node->data(); - add_op->set_input_x1(*split_node->data(), "y" + std::to_string(idx)); + add_op->set_input_x1(*split_node->data(), + "y" + paddle::lite::to_string(idx)); add_op->set_input_x2(*zero_node->data()); idx++; } diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index 770ea345b633034972cb71cb4f1236ecefff36d7..d7b14a9319951eb827cbc9d346ee8e59e9571aee 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() { << "[NPU] No input nodes found for building NPU model"; CHECK(!device_onames_.empty()) << "[NPU] No output nodes found for building NPU model"; + // Build the HiAI IR graph to HiAI om model as the device program - device_program_ = lite::npu::Device::Global().Build( + if 
(device_program_map_.count(inputs_shape_) > 0) { + return status; + } + auto device_client = lite::npu::Device::Global().Build( model_name_, device_inodes, device_onodes); - if (device_program_ == nullptr) { + if (device_client == nullptr) { LOG(WARNING) << "[NPU] Build model failed!"; return subgraph::FAILED; } + auto device_program = std::make_shared(device_client); + device_program_map_[inputs_shape_] = device_program; // Query and check the dimensions of valid input and output tensors std::vector device_idims, device_odims; - if (device_program_->GetModelIOTensorDim( + if (device_program->client->GetModelIOTensorDim( model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] Get the dimensions of input and output tensors failed!"; return subgraph::FAILED; } + device_program->device_idims = device_idims; + device_program->device_odims = device_odims; + CHECK_EQ(device_idims.size(), device_inames_.size()); CHECK_EQ(device_odims.size(), device_onames_.size()); origin_idims_.resize(device_inames_.size()); @@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() { origin_odims_.resize(device_onames_.size()); origin_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size()); + for (int i = 0; i < device_inames_.size(); i++) { auto node = graph.Get(device_inames_[i]); auto precision = node->precision(); @@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() { device_itensors_[i].reset(new hiai::AiTensor); device_itensors_[i]->Init(&(device_idims[i])); } + device_program->origin_idims = origin_idims_; + for (int i = 0; i < device_onames_.size(); i++) { auto node = graph.Get(device_onames_[i]); auto precision = node->precision(); @@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() { << PrecisionToStr(precision); break; } + device_program->origin_odims = origin_odims_; + CHECK_EQ(origin_odims_[i].production(), device_odims[i].GetNumber() * device_odims[i].GetChannel() * 
device_odims[i].GetHeight() * device_odims[i].GetWidth()); @@ -181,14 +195,25 @@ int SubgraphEngine::BuildDeviceProgram() { int SubgraphEngine::LaunchDeviceProgram() { // Copy the data of origin input tensors to the buffer of input HiAI tensors + // init device_itensors_, device_otensors_, origin_otensors_ + auto device_program = device_program_map_[inputs_shape_]; for (size_t i = 0; i < device_itensors_.size(); i++) { + device_itensors_[i]->Init(&(device_program->device_idims[i])); std::memcpy(device_itensors_[i]->GetBuffer(), origin_itensors_[i]->raw_data(), origin_itensors_[i]->memory_size()); } + for (size_t i = 0; i < device_otensors_.size(); i++) { + device_otensors_[i]->Init(&(device_program->device_odims[i])); + } + for (size_t i = 0; i < origin_otensors_.size(); i++) { + origin_otensors_[i]->Resize(device_program->origin_odims[i]); + } + // Run the HiAI model by name std::string key = "model_name"; // Note: key seems must be model_name - model_context_.AddPara(key, model_name_); + hiai::AiContext model_context; + model_context.AddPara(key, model_name_); auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); @@ -196,11 +221,11 @@ int SubgraphEngine::LaunchDeviceProgram() { }; int istamp; auto start_time = GetCurrentUS(); - CHECK_EQ( - device_program_->Process( - model_context_, device_itensors_, device_otensors_, 1000, istamp), - hiai::AI_SUCCESS); + CHECK_EQ(device_program->client->Process( + model_context, device_itensors_, device_otensors_, 1000, istamp), + hiai::AI_SUCCESS); VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; + // Copy the data of output HiAI tensor to the buffer of origin output tensors for (size_t i = 0; i < device_otensors_.size(); i++) { std::memcpy(const_cast(origin_otensors_[i]->raw_data()), @@ -210,6 +235,18 @@ int SubgraphEngine::LaunchDeviceProgram() { return 0; } +bool SubgraphEngine::InputShapeChanged() { + std::vector> new_shape; + for (auto origin_itensor : 
origin_itensors_) { + new_shape.push_back(origin_itensor->dims().Vectorize()); + } + inputs_shape_ = new_shape; + if (device_program_map_.count(inputs_shape_) > 0) { + return false; + } + return true; +} + void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h index 29aeb01cdb50e2a9dd6d066a2f11106fd4cb20fb..801f61b0365c03d59c36e2a62ac3c2bb61f46607 100644 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine { : subgraph::Engine( ctx, block_idx, block_desc, input_names, output_names, scope) {} + struct device_program_t { + explicit device_program_t(std::shared_ptr _client) + : client(_client) {} + std::shared_ptr client{nullptr}; + std::vector origin_idims{}; + std::vector origin_odims{}; + std::vector device_idims{}; + std::vector device_odims{}; + }; + protected: int BuildDeviceProgram() override; int LaunchDeviceProgram() override; + bool InputShapeChanged() override; - std::string model_name_; - hiai::AiContext model_context_; - std::vector device_inames_; - std::vector device_onames_; - std::vector> device_itensors_; - std::vector> device_otensors_; - std::unique_ptr device_program_{nullptr}; + std::string model_name_{"model.om"}; + std::vector> inputs_shape_{}; + std::map>, std::shared_ptr> + device_program_map_{}; + std::vector device_inames_{}; + std::vector device_onames_{}; + std::vector> device_itensors_{}; + std::vector> device_otensors_{}; }; class SubgraphCompute : public KernelLite { diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 25afb2fc399c6a4da8775440c1602031061267f7..652ce2593828c5131c0e3192db0a45a490b3cbc6 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ 
-1,4 +1,4 @@ -if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL)) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_OPENCL)) return () endif() @@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten #lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc # DEPS conv_opencl op_registry program context) +#lite_cc_test(test_im2col_buffer_opencl SRCS im2col_buffer_test.cc +# DEPS tensor cl_context cl_wrapper cl_target_wrapper) + #lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc # DEPS depthwise_conv2d_opencl op_registry program context) diff --git a/lite/kernels/opencl/activation_image_compute.cc b/lite/kernels/opencl/activation_image_compute.cc index d24275f24b6503c777178257ae45305a7abdb02c..dbe487ba91d00c2de4c08edf140526d727bac6b5 100644 --- a/lite/kernels/opencl/activation_image_compute.cc +++ b/lite/kernels/opencl/activation_image_compute.cc @@ -101,6 +101,7 @@ class ActivationComputeImageDefault status = kernel.setArg(++arg_idx, scale_); CL_CHECK_FATAL(status); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " @@ -112,6 +113,7 @@ class ActivationComputeImageDefault VLOG(4) << "threshold:" << threshold_; VLOG(4) << "scale:" << scale_; VLOG(4) << "kernel func name:" << kernel_func_name_; +#endif auto global_work_size = cl::NDRange{static_cast(image_shape["width"]), @@ -177,7 +179,7 @@ REGISTER_LITE_KERNEL( // exp REGISTER_LITE_KERNEL( - exp_act, + exp, kOpenCL, kFP16, kImageDefault, @@ -195,7 +197,7 @@ REGISTER_LITE_KERNEL( // tanh REGISTER_LITE_KERNEL( - tanh_act, + tanh, kOpenCL, kFP16, kImageDefault, diff --git a/lite/kernels/opencl/activation_image_compute_test.cc b/lite/kernels/opencl/activation_image_compute_test.cc index 
40751a44b2b81dae387e2614f281b4a5e4a7bace..2f30ec6743fd488fc88f0b9f9d6544b3ca7642bf 100644 --- a/lite/kernels/opencl/activation_image_compute_test.cc +++ b/lite/kernels/opencl/activation_image_compute_test.cc @@ -109,13 +109,13 @@ TEST(act_image2d_fp16, compute) { func_name = "sigmoid"; break; case 6: // tanh - func_name = "tanh_act"; + func_name = "tanh"; break; case 7: // tanh func_name = "swish"; break; case 8: // tanh - func_name = "exp_act"; + func_name = "exp"; break; } LOG(INFO) << "func_name: " << func_name; @@ -307,7 +307,7 @@ USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); // exp -USE_LITE_KERNEL(exp_act, kOpenCL, kFP16, kImageDefault, ImageDefault); +USE_LITE_KERNEL(exp, kOpenCL, kFP16, kImageDefault, ImageDefault); // swish USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault); @@ -316,7 +316,7 @@ USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(leaky_relu, kOpenCL, kFP16, kImageDefault, ImageDefault); // tanh act -USE_LITE_KERNEL(tanh_act, kOpenCL, kFP16, kImageDefault, ImageDefault); +USE_LITE_KERNEL(tanh, kOpenCL, kFP16, kImageDefault, ImageDefault); // relu image2d fp16 USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault); diff --git a/lite/kernels/opencl/bilinear_interp_image_compute.cc b/lite/kernels/opencl/bilinear_interp_image_compute.cc index eeab8b043b3344b492fd9bafc3259e8d8ed08438..7e32010c0b5ff5cedad8b0da7ce7233fbf73da6f 100644 --- a/lite/kernels/opencl/bilinear_interp_image_compute.cc +++ b/lite/kernels/opencl/bilinear_interp_image_compute.cc @@ -77,17 +77,21 @@ class BilinearInterpImageCompute int out_h = out_dims[2]; int out_w = out_dims[3]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "out->dims():" << out_dims; +#endif auto 
out_image_shape = InitImageDimInfoWith(out_dims); auto* x_img = x->data(); - // VLOG(4) << "x_image: " << x_img; auto* out_img = out->mutable_data( out_image_shape["width"], out_image_shape["height"]); + +#ifndef LITE_SHUTDOWN_LOG + // VLOG(4) << "x_image: " << x_img; // VLOG(4) << "out_image: " << out_img; VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " " << out_image_shape["height"]; @@ -96,6 +100,7 @@ class BilinearInterpImageCompute << ", align_delta: " << align_delta; VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w; VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -107,8 +112,10 @@ class BilinearInterpImageCompute DDim(std::vector{ static_cast(out_image_shape["width"]), static_cast(out_image_shape["height"])})); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "default_work_size: " << default_work_size[0] << ", " << default_work_size[1] << ", " << default_work_size[2]; +#endif cl_int status = kernel.setArg(arg_idx++, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(arg_idx++, *out_img); @@ -142,9 +149,10 @@ class BilinearInterpImageCompute event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; +#endif } protected: diff --git a/lite/kernels/opencl/concat_image_compute.cc b/lite/kernels/opencl/concat_image_compute.cc index f1b0cb21bb8ea68248c3caabb1146bbff461c6c9..95e64025662a4b87cd68c211ccc0b0fb7b84a9f2 100644 --- a/lite/kernels/opencl/concat_image_compute.cc +++ b/lite/kernels/opencl/concat_image_compute.cc @@ -123,7 +123,8 @@ class ConcatComputeImage : public KernelLitedims()[inputs[0]->dims().size() - 1]; - VLOG(4) << "concat 输入尺寸: "; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "concat input shape: "; for (size_t i = 0; i < inputs.size(); i++) { VLOG(4) << "inputs [" 
<< i << "]" << "[" << inputs[i]->dims().size() << "D]:" @@ -132,12 +133,13 @@ class ConcatComputeImage : public KernelLitedims()[3]; } - VLOG(4) << "concat 输出尺寸: "; + VLOG(4) << "concat output shape: "; VLOG(4) << " out dims: " << "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; VLOG(4) << "axis_: " << axis_; VLOG(4) << "flag_: " << flag_; +#endif auto global_work_size = cl::NDRange{static_cast(x_dims[x_dims.size() - 1]), @@ -145,6 +147,7 @@ class ConcatComputeImage : public KernelLite(image_shape["height"])}; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(param.output->target()); VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " << image_shape["height"]; @@ -157,6 +160,7 @@ class ConcatComputeImage : public KernelLiteGetKernel(kernel_key.str()); int out_w = x_dims[x_dims.size() - 1]; @@ -198,8 +202,10 @@ class ConcatComputeImage : public KernelLitedata(); int in_w = in_dims[in_dims.size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " << image_shape["height"]; +#endif global_work_size = cl::NDRange{static_cast(in_dims[in_dims.size() - 1]), static_cast(image_shape["width"] / diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 8a6017d1ad295b7ae833cd15de47655e669e5b79..d664e37150fcc661e4bb97ed57a42364dd0d475d 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -78,6 +78,7 @@ void ConvImageCompute::PrepareForRun() { VLOG(3) << "dilation_equal:" << dilation_equal; VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " " << paddings[2] << " " << paddings[3]; + CHECK(pad_equal && stride_equal && dilation_equal); if (kernel_h == 1 && kernel_w == 1) { @@ -85,9 +86,9 @@ void ConvImageCompute::PrepareForRun() { if (param.x->dims()[1] % 4 == 0) { kernel_func_names_.push_back("conv2d_1x1_simple"); } else { - 
kernel_func_names_.push_back("conv2d_1x1"); + kernel_func_names_.push_back("conv2d_1x1_opt"); } - kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl"); + kernel_func_paths_.push_back("image/conv2d_1x1_opt_kernel.cl"); CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); @@ -97,7 +98,7 @@ void ConvImageCompute::PrepareForRun() { filter_gpu_image_.mutable_data( filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); - impl_ = &ConvImageCompute::Conv2d1x1; + impl_ = &ConvImageCompute::Conv2d1x1opt; #define DEPTH_CONV_USE_SPL #ifdef DEPTH_CONV_USE_SPL } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && @@ -141,9 +142,10 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::DepthwiseConv2d; - } else if (kernel_h == 3 && kernel_h == 3) { + } else if (kernel_w == 3 && kernel_h == 3) { // conv2d_3x3 - kernel_func_names_.push_back("conv2d_3x3_opt"); + kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch" + : "conv2d_3x3_opt"); kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); CLImageConverterFolder converter; @@ -156,6 +158,8 @@ void ConvImageCompute::PrepareForRun() { impl_ = &ConvImageCompute::Conv2d3x3opt; } else if (kernel_h == 5 && kernel_w == 5) { +#define CONV_5x5_OPT +#ifndef CONV_5x5_OPT // conv2d_5x5 kernel_func_names_.push_back("conv2d_5x5"); kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl"); @@ -169,7 +173,27 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::Conv2d5x5; +#else + // conv2d_5x5_opt + + kernel_func_names_.push_back(bs > 1 ? 
"conv2d_5x5_multi_batch" + : "conv2d_5x5_opt"); + kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl"); + + CLImageConverterFolder converter; + const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); + std::vector filter_image_v(filter_image_dims[0] * + filter_image_dims[1] * 4); // 4 : RGBA + converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + + impl_ = &ConvImageCompute::Conv2d5x5opt; +#endif +#undef CONV_5x5_OPT } else if (kernel_h == 7 && kernel_w == 7) { +#define CONV_7x7_OPT +#ifndef CONV_7x7_OPT // conv2d_7x7 kernel_func_names_.push_back("conv2d_7x7"); kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl"); @@ -183,6 +207,25 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::Conv2d7x7; + +#else + // conv2d_7x7 + kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch" + : "conv2d_7x7_opt"); + kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl"); + + CLImageConverterFolder converter; + const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); + std::vector filter_image_v(filter_image_dims[0] * + filter_image_dims[1] * 4); // 4 : RGBA + converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + this->filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + + impl_ = &ConvImageCompute::Conv2d7x7opt; +#endif +#undef CONV_7x7_OPT + } else { LOG(FATAL) << "conv image compute not support this condition yet! 
"; } @@ -229,7 +272,7 @@ void ConvImageCompute::PrepareForRun() { } } -void ConvImageCompute::Conv2d1x1() { +void ConvImageCompute::Conv2d1x1opt() { const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -269,6 +312,7 @@ void ConvImageCompute::Conv2d1x1() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d_1x1 params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -290,7 +334,7 @@ void ConvImageCompute::Conv2d1x1() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; - +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); CHECK_GE(input_dims.size(), 4); @@ -313,10 +357,12 @@ void ConvImageCompute::Conv2d1x1() { auto kernel = context.cl_context()->GetKernel(kernel_key.str()); int maped_w = maptofactor(w, 4); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... 
" << kernel_key.str(); VLOG(4) << "maped_w: " << maped_w; VLOG(4) << "hasbias: " << has_bias; +#endif cl_int status; int arg_idx = 0; @@ -363,21 +409,27 @@ void ConvImageCompute::Conv2d1x1() { static_cast(maped_w), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif size_t max_work_group_size = 0; kernel.getWorkGroupInfo(CLRuntime::Global()->device(), CL_KERNEL_WORK_GROUP_SIZE, &max_work_group_size); cl::NDRange local_work_size = cl::NullRange; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "max_work_group_size: " << max_work_group_size; +#endif if (max_work_group_size > 0 && use_lws) { local_work_size = context.cl_context()->LocalWorkSize(global_work_size, max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif } status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( @@ -453,6 +505,7 @@ void ConvImageCompute::Conv2d3x3() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -477,6 +530,7 @@ void ConvImageCompute::Conv2d3x3() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -496,9 +550,12 @@ void ConvImageCompute::Conv2d3x3() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... 
" << kernel_key.str(); VLOG(4) << "w: " << w; +#endif cl_int status; int arg_idx = 0; @@ -513,7 +570,9 @@ void ConvImageCompute::Conv2d3x3() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -553,9 +612,11 @@ void ConvImageCompute::Conv2d3x3() { static_cast(default_work_size.data()[1]), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, @@ -586,7 +647,8 @@ void ConvImageCompute::Conv2d3x3opt() { int output_width = output_dims[3]; int output_height = output_dims[2]; int output_channel = output_dims[1]; - + CHECK_EQ(input_dims[0], output_dims[0]); + int batch = input_dims[0]; auto out_image_shape = InitImageDimInfoWith(output_dims); auto* out_image = param.output->mutable_data( out_image_shape["width"], out_image_shape["height"]); @@ -611,8 +673,9 @@ void ConvImageCompute::Conv2d3x3opt() { int h_blk_size = 1; int h_blk = (nh + h_blk_size - 1) / h_blk_size; - // default_work_size[2] = h_blk; +// default_work_size[2] = h_blk; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," // << input_image_shape["height"]; @@ -632,6 +695,7 @@ void ConvImageCompute::Conv2d3x3opt() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -651,8 +715,11 @@ void ConvImageCompute::Conv2d3x3opt() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = 
context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... " << kernel_key.str(); +#endif cl_int status; int arg_idx = 0; @@ -667,7 +734,9 @@ void ConvImageCompute::Conv2d3x3opt() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -681,6 +750,8 @@ void ConvImageCompute::Conv2d3x3opt() { status = kernel.setArg(++arg_idx, dilations[0]); CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, batch); + CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, input_channel); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, input_width); @@ -696,22 +767,27 @@ void ConvImageCompute::Conv2d3x3opt() { cl::NDRange{static_cast(default_work_size.data()[0]), static_cast(w_blk), static_cast(h_blk)}; - +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif size_t max_work_group_size = 0; kernel.getWorkGroupInfo(CLRuntime::Global()->device(), CL_KERNEL_WORK_GROUP_SIZE, &max_work_group_size); cl::NDRange local_work_size = cl::NullRange; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "max_work_group_size: " << max_work_group_size; +#endif if (max_work_group_size > 0 && use_lws) { local_work_size = context.cl_context()->LocalWorkSize(global_work_size, max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif } status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( @@ -767,6 +843,7 @@ void ConvImageCompute::Conv2d5x5() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << 
"============ conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -789,6 +866,7 @@ void ConvImageCompute::Conv2d5x5() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -808,9 +886,12 @@ void ConvImageCompute::Conv2d5x5() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... " << kernel_key.str(); VLOG(4) << "w: " << w; +#endif cl_int status; int arg_idx = 0; @@ -825,7 +906,9 @@ void ConvImageCompute::Conv2d5x5() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -855,9 +938,11 @@ void ConvImageCompute::Conv2d5x5() { static_cast(default_work_size.data()[1]), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, @@ -870,6 +955,172 @@ void ConvImageCompute::Conv2d5x5() { context.cl_wait_list()->emplace(out_image, event_); } +void ConvImageCompute::Conv2d5x5opt() { + const auto& param = *param_.get_mutable(); + auto input_dims = param.x->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto dilations = *param.dilations; + + auto* input_image = param.x->data(); + auto* filter_image = filter_gpu_image_.data(); + auto filter_dims = param.filter->dims(); + auto output_dims = 
param.output->dims(); + + int input_width = input_dims[3]; + int input_height = input_dims[2]; + int input_channel = input_dims[1]; + int output_width = output_dims[3]; + int output_height = output_dims[2]; + int output_channel = output_dims[1]; + CHECK_EQ(input_dims[0], output_dims[0]); + int batch = input_dims[0]; + + auto out_image_shape = InitImageDimInfoWith(output_dims); + auto* out_image = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + int w_blk_size = 5; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + // default_work_size[1] = w_blk; + + int h_blk_size = 1; + int h_blk = (nh + h_blk_size - 1) / h_blk_size; +// default_work_size[2] = h_blk; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "============ conv2d params ============"; + // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," + // << input_image_shape["height"]; + // VLOG(4) << "input_image: " << input_image; + VLOG(4) << "input_dims: " << input_dims; + VLOG(4) << "filter_dims: " << filter_dims; + // VLOG(4) << "filter_image: " << filter_image; + VLOG(4) << "output_dims: " << output_dims; + VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " + << out_image_shape["height"]; + VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; + VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; + VLOG(4) << "strides: " << strides[0] << "," << strides[1]; + VLOG(4) << "dilations.size : " << dilations.size(); + VLOG(4) << "dilations: " << dilations[0] << ", " << 
dilations[1]; + VLOG(4) << "default work size{c_block, w, nh}: " + << "{" << c_block << ", " << w << ", " << nh << "" + << "}"; +#endif + CHECK_GE(dilations.size(), 2); + CHECK(dilations[0] == dilations[1]); + CHECK_GE(input_dims.size(), 4); + CHECK_GE(paddings.size(), 2); + CHECK(paddings[0] == paddings[1]); + CHECK_GE(strides.size(), 2); + CHECK(strides[0] == strides[1]); + + const cl::Image2D* bias_image = nullptr; + if (has_bias) { + bias_image = bias_gpu_image_.data(); + } + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... " << kernel_key.str(); +#endif + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, w_blk); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, h_blk); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_image); + CL_CHECK_FATAL(status); + if (has_bias) { + status = kernel.setArg(++arg_idx, *bias_image); + CL_CHECK_FATAL(status); + } + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, strides[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, paddings[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, dilations[0]); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, batch); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_channel); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, 
output_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_height); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(w_blk), + static_cast(h_blk)}; + +// VLOG(4) << "out_image: " << out_image; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," + << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif + size_t max_work_group_size = 0; + kernel.getWorkGroupInfo(CLRuntime::Global()->device(), + CL_KERNEL_WORK_GROUP_SIZE, + &max_work_group_size); + cl::NDRange local_work_size = cl::NullRange; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "max_work_group_size: " << max_work_group_size; +#endif + if (max_work_group_size > 0 && use_lws) { + local_work_size = context.cl_context()->LocalWorkSize(global_work_size, + max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," + << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif + } + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + local_work_size, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); +} + void ConvImageCompute::Conv2d7x7() { const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); @@ -912,6 +1163,7 @@ void ConvImageCompute::Conv2d7x7() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -934,6 +1186,7 @@ void ConvImageCompute::Conv2d7x7() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -953,9 +1206,12 @@ void 
ConvImageCompute::Conv2d7x7() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... " << kernel_key.str(); VLOG(4) << "w: " << w; +#endif cl_int status; int arg_idx = 0; @@ -970,7 +1226,9 @@ void ConvImageCompute::Conv2d7x7() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -1000,9 +1258,11 @@ void ConvImageCompute::Conv2d7x7() { static_cast(default_work_size.data()[1]), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, @@ -1014,7 +1274,167 @@ void ConvImageCompute::Conv2d7x7() { CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_image, event_); } +void ConvImageCompute::Conv2d7x7opt() { + const auto& param = *param_.get_mutable(); + auto input_dims = param.x->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto dilations = *param.dilations; + auto* input_image = param.x->data(); + auto* filter_image = filter_gpu_image_.data(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + + int input_width = input_dims[3]; + int input_height = input_dims[2]; + int input_channel = input_dims[1]; + int output_width = output_dims[3]; + int output_height = output_dims[2]; + int output_channel = output_dims[1]; + CHECK_EQ(input_dims[0], output_dims[0]); + int batch = input_dims[0]; + auto out_image_shape = InitImageDimInfoWith(output_dims); + auto* 
out_image = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + int w_blk_size = 5; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + // default_work_size[1] = w_blk; + + int h_blk_size = 1; + int h_blk = (nh + h_blk_size - 1) / h_blk_size; +// default_work_size[2] = h_blk; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "============ conv2d 7x7 params ============"; + // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," + // << input_image_shape["height"]; + // VLOG(4) << "input_image: " << input_image; + VLOG(4) << "input_dims: " << input_dims; + VLOG(4) << "filter_dims: " << filter_dims; + // VLOG(4) << "filter_image: " << filter_image; + VLOG(4) << "output_dims: " << output_dims; + VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " + << out_image_shape["height"]; + VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; + VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; + VLOG(4) << "strides: " << strides[0] << "," << strides[1]; + VLOG(4) << "dilations.size : " << dilations.size(); + VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + VLOG(4) << "default work size{c_block, w, nh}: " + << "{" << c_block << ", " << w << ", " << nh << "" + << "}"; +#endif + CHECK_GE(dilations.size(), 2); + CHECK(dilations[0] == dilations[1]); + CHECK_GE(input_dims.size(), 4); + CHECK_GE(paddings.size(), 2); + CHECK(paddings[0] == paddings[1]); + CHECK_GE(strides.size(), 2); + CHECK(strides[0] == strides[1]); + + const 
cl::Image2D* bias_image = nullptr; + if (has_bias) { + bias_image = bias_gpu_image_.data(); + } + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... " << kernel_key.str(); +#endif + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, w_blk); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, h_blk); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_image); + CL_CHECK_FATAL(status); + if (has_bias) { + status = kernel.setArg(++arg_idx, *bias_image); + CL_CHECK_FATAL(status); + } + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, strides[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, paddings[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, dilations[0]); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, batch); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_channel); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_height); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(w_blk), + static_cast(h_blk)}; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," + << global_work_size[1] << "," << 
global_work_size[2] << "}"; +#endif + size_t max_work_group_size = 0; + kernel.getWorkGroupInfo(CLRuntime::Global()->device(), + CL_KERNEL_WORK_GROUP_SIZE, + &max_work_group_size); + cl::NDRange local_work_size = cl::NullRange; + if (max_work_group_size > 0 && use_lws) { + local_work_size = context.cl_context()->LocalWorkSize(global_work_size, + max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," + << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif + } + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + local_work_size, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); +} void ConvImageCompute::DepthwiseConv2d3x3s1() { const auto& param = *param_.get_mutable(); auto x_dims = param.x->dims(); @@ -1071,7 +1491,9 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { const cl::Image2D* bias_image = nullptr; if (has_bias) { bias_image = bias_gpu_image_.data(); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -1099,12 +1521,16 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { CL_KERNEL_WORK_GROUP_SIZE, &max_work_group_size); cl::NDRange local_work_size = cl::NullRange; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "max_work_group_size: " << max_work_group_size; +#endif if (max_work_group_size > 0 && use_lws) { local_work_size = context.cl_context()->LocalWorkSize(global_work_size, max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif } status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( @@ -1153,6 +1579,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() { int nh = output_dims[0] * output_dims[2]; auto global_work_size = cl::NDRange(c_block, 
w, nh); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "setArg"; VLOG(4) << "c_block = " << c_block; VLOG(4) << "w = " << w; @@ -1166,6 +1593,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() { VLOG(4) << "x_dims[2] = " << x_dims[2]; VLOG(4) << "output_dims[3] = " << output_dims[3]; VLOG(4) << "output_dims[2] = " << output_dims[2]; +#endif cl_int status; int arg_idx = 0; @@ -1185,7 +1613,9 @@ void ConvImageCompute::DepthwiseConv2d3x3() { const cl::Image2D* bias_image = nullptr; if (has_bias) { bias_image = bias_gpu_image_.data(); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -1261,6 +1691,7 @@ void ConvImageCompute::DepthwiseConv2d() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ depthwise conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -1282,6 +1713,7 @@ void ConvImageCompute::DepthwiseConv2d() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -1303,9 +1735,12 @@ void ConvImageCompute::DepthwiseConv2d() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... 
" << kernel_key.str(); VLOG(4) << "w: " << w; +#endif cl_int status; int arg_idx = 0; @@ -1320,7 +1755,9 @@ void ConvImageCompute::DepthwiseConv2d() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -1354,9 +1791,11 @@ void ConvImageCompute::DepthwiseConv2d() { static_cast(default_work_size.data()[1]), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 3f8db82f4a6b3f7cf0abad3cdac4198fd0b516d5..57e4b91e0a842487fc5dfce4799fab244348772d 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -41,11 +41,13 @@ class ConvImageCompute : public KernelLite(filter_v.data(), filter_dim); - // auto* filter_image2d = filter.mutable_data( // filter_image_width, // filter_image_height, diff --git a/lite/kernels/opencl/elementwise_add_buffer_compute.cc b/lite/kernels/opencl/elementwise_add_buffer_compute.cc index 5dff529fb4fbfec023996b0169e948d597afa78e..3961ac7583917fdcd761614558c493e6917d3294 100644 --- a/lite/kernels/opencl/elementwise_add_buffer_compute.cc +++ b/lite/kernels/opencl/elementwise_add_buffer_compute.cc @@ -41,9 +41,11 @@ void ElementwiseAddCompute::Run() { STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(ele_param_->X->target()); VLOG(4) << TargetToStr(ele_param_->Y->target()); VLOG(4) << 
TargetToStr(ele_param_->Out->target()); +#endif int arg_idx = 0; cl_int status = kernel.setArg(arg_idx, *x_buf); CL_CHECK_FATAL(status); @@ -87,10 +89,12 @@ void ElementwiseAddCompute::UpdateParams() { for (int i = static_cast(y_dims.size() + axis); i < x_dims.size(); ++i) { num_ *= x_dims[i]; } +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "axis: " << axis; VLOG(4) << "batch: " << batch_; VLOG(4) << "channels: " << channels_; VLOG(4) << "num: " << num_; +#endif } } // namespace opencl diff --git a/lite/kernels/opencl/elementwise_add_image_compute.cc b/lite/kernels/opencl/elementwise_add_image_compute.cc index e9015ab16044f9346bbaa997e4f47dfbcd9bb023..6d0ebf638f0a8967e27a657131e1cac89967ee0b 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.cc +++ b/lite/kernels/opencl/elementwise_add_image_compute.cc @@ -62,6 +62,7 @@ void ElementwiseAddImageCompute::Run() { auto* out = ele_param_->Out; auto axis = ele_param_->axis; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "y->target():" << TargetToStr(y->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); @@ -69,6 +70,7 @@ void ElementwiseAddImageCompute::Run() { VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "out->dims():" << out->dims(); VLOG(4) << "axis:" << axis; +#endif paddle::lite::CLImageConverterDefault default_convertor; auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h @@ -83,10 +85,12 @@ void ElementwiseAddImageCompute::Run() { auto* out_img = out->mutable_data(out_img_shape[0], out_img_shape[1]); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " << out_img_shape[1]; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -104,8 +108,9 @@ void ElementwiseAddImageCompute::Run() { } else if 
(y_dims.size() == 1) { if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) { int tensor_w = x->dims()[x->dims().size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "tensor_w:" << tensor_w; - +#endif cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *y_img); @@ -127,7 +132,9 @@ void ElementwiseAddImageCompute::Run() { auto global_work_size = cl::NDRange{static_cast(x_img_width), static_cast(x_img_height)}; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; +#endif auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, diff --git a/lite/kernels/opencl/elementwise_mul_image_compute.cc b/lite/kernels/opencl/elementwise_mul_image_compute.cc index c5e43616f957695aa598b9f383135bf603eb42b4..78a025566f24cb604910eb3766cb05c8647e1e03 100644 --- a/lite/kernels/opencl/elementwise_mul_image_compute.cc +++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc @@ -56,7 +56,7 @@ class ElementwiseMulImageCompute } else { kernel_func_name_ = "channel_mul_d2_hw"; } - } else if (y_dims.size() == 4) { + } else if (y_dims.size() == 4 || x_dims.size() == 4) { kernel_func_name_ = "channel_mul_d4"; } else { LOG(FATAL) << "ElementwiseMul not supported y_dims.size():" @@ -80,12 +80,14 @@ class ElementwiseMulImageCompute auto* y = ele_param_->Y; auto* out = ele_param_->Out; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "y->target():" << TargetToStr(y->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << x->dims(); VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "out->dims():" << out->dims(); +#endif paddle::lite::CLImageConverterDefault default_convertor; auto x_img_shape = @@ -101,10 +103,12 @@ class ElementwiseMulImageCompute auto* out_img = out->mutable_data(out_img_shape[0], out_img_shape[1]); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << 
"x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " << out_img_shape[1]; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -123,7 +127,9 @@ class ElementwiseMulImageCompute CL_CHECK_FATAL(status); } else if (y_dims.size() == 1 || y_dims.size() == 4) { auto tensor_w = x_dims[x_dims.size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "tensor_w:" << tensor_w; +#endif // kernel: channel_mul_d1 / channel_mul_d4 cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); @@ -136,7 +142,9 @@ class ElementwiseMulImageCompute } else if (y_dims.size() == 2) { if (x_dims[0] == y_dims[0] && x_dims[1] == y_dims[1]) { auto tensor_w = x_dims[x_dims.size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "tensor_w:" << tensor_w; +#endif // kernel: channel_mul_d2_nc cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); @@ -149,7 +157,9 @@ class ElementwiseMulImageCompute } else { auto y_tensor_h = y->dims()[0]; auto y_tensor_w = y->dims()[1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "y_tensor_w:" << y_tensor_w << " y_tensor_h:" << y_tensor_h; +#endif // kernel: channel_mul_d2_hw cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); @@ -162,6 +172,18 @@ class ElementwiseMulImageCompute status = kernel.setArg(++arg_idx, static_cast(y_tensor_h)); CL_CHECK_FATAL(status); } + } else if (x_dims.size() == 4) { + auto tensor_w = y_dims[y_dims.size() - 1]; + VLOG(4) << "tensor_w:" << tensor_w; + // kernel: channel_mul_d4 + cl_int status = kernel.setArg(arg_idx, *y_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *x_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *out_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(tensor_w)); + CL_CHECK_FATAL(status); } else { LOG(FATAL) << "ElementwiseMul not 
supported y_dims.size():" << y_dims.size(); @@ -179,8 +201,9 @@ class ElementwiseMulImageCompute event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; +#endif } protected: diff --git a/lite/kernels/opencl/elementwise_sub_image_compute.cc b/lite/kernels/opencl/elementwise_sub_image_compute.cc index 3a18501dfb38d3a11f432751c6abd51ce1c7a180..0bc867d7f124582660b7a0a9a95d026d910fc2d3 100644 --- a/lite/kernels/opencl/elementwise_sub_image_compute.cc +++ b/lite/kernels/opencl/elementwise_sub_image_compute.cc @@ -62,6 +62,7 @@ void ElementwiseSubImageCompute::Run() { auto* out = ele_param_->Out; auto axis = ele_param_->axis; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "y->target():" << TargetToStr(y->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); @@ -69,6 +70,7 @@ void ElementwiseSubImageCompute::Run() { VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "out->dims():" << out->dims(); VLOG(4) << "axis:" << axis; +#endif paddle::lite::CLImageConverterDefault default_convertor; auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h @@ -83,10 +85,12 @@ void ElementwiseSubImageCompute::Run() { auto* out_img = out->mutable_data(out_img_shape[0], out_img_shape[1]); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " << out_img_shape[1]; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -104,8 +108,9 @@ void ElementwiseSubImageCompute::Run() { } else if (y_dims.size() == 1) { if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) { int tensor_w = x->dims()[x->dims().size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << 
"tensor_w:" << tensor_w; - +#endif cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *y_img); @@ -127,7 +132,10 @@ void ElementwiseSubImageCompute::Run() { auto global_work_size = cl::NDRange{static_cast(x_img_width), static_cast(x_img_height)}; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; +#endif + auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, diff --git a/lite/kernels/opencl/grid_sampler_image_compute.cc b/lite/kernels/opencl/grid_sampler_image_compute.cc index e174286ca1fefa3c56bca04b433015ac769cfcbf..243737a81331a7159834d30ccfb2fab181baeebe 100644 --- a/lite/kernels/opencl/grid_sampler_image_compute.cc +++ b/lite/kernels/opencl/grid_sampler_image_compute.cc @@ -57,10 +57,12 @@ class GridSamplerImageCompute : public KernelLitedims(); auto in_dims = x->dims(); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "out->dims():" << out_dims; +#endif auto out_image_shape = InitImageDimInfoWith(out_dims); auto* x_img = x->data(); @@ -71,10 +73,11 @@ class GridSamplerImageCompute : public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image" << out_img; VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " << out_image_shape["height"]; - +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); @@ -87,8 +90,10 @@ class GridSamplerImageCompute : public KernelLite{ static_cast(out_image_shape["width"]), static_cast(out_image_shape["height"])})); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "default_work_size: " << default_work_size[0] << ", " << default_work_size[1] << ", " << 
default_work_size[2]; +#endif cl_int status = kernel.setArg(arg_idx++, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(arg_idx++, *grid_img); @@ -114,9 +119,10 @@ class GridSamplerImageCompute : public KernelLiteemplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; +#endif } protected: diff --git a/lite/backends/opencl/cl_im2col_test.cc b/lite/kernels/opencl/im2col_buffer_test.cc similarity index 100% rename from lite/backends/opencl/cl_im2col_test.cc rename to lite/kernels/opencl/im2col_buffer_test.cc diff --git a/lite/kernels/opencl/instance_norm_image_compute.cc b/lite/kernels/opencl/instance_norm_image_compute.cc index d90acdb02d75958b72d986453d0fe6adacb43c0f..176b4149b2656c6197f43336753bc53d5fb18769 100644 --- a/lite/kernels/opencl/instance_norm_image_compute.cc +++ b/lite/kernels/opencl/instance_norm_image_compute.cc @@ -89,19 +89,23 @@ class InstanceNormImageCompute : public KernelLitetarget():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; +#endif auto out_image_shape = InitImageDimInfoWith(in_dims); auto* x_img = x->data(); - auto* out_img = out->mutable_data( out_image_shape["width"], out_image_shape["height"]); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " " << out_image_shape["height"]; VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w; +#endif int threads = 512; int group_size_x = (channel + 3) / 4; @@ -113,10 +117,13 @@ class InstanceNormImageCompute : public KernelLite(group_size_x * threads), static_cast(group_size_y), static_cast(1)}; + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "local_work_size:[2D]:" << local_work_size[0] << " " << local_work_size[1] << " " << local_work_size[2]; VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; 
+#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; diff --git a/lite/kernels/opencl/io_copy_buffer_compute.cc b/lite/kernels/opencl/io_copy_buffer_compute.cc index 0e9a5941c0a3484ffbb72012f64c07296694078b..6a49cc2577a58690e5e0b6a6ede82df0bdc99bb1 100644 --- a/lite/kernels/opencl/io_copy_buffer_compute.cc +++ b/lite/kernels/opencl/io_copy_buffer_compute.cc @@ -42,11 +42,13 @@ class IoCopyHostToOpenCLCompute CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kARM)); auto mem_size = param.x->memory_size(); +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "param.x->memory_size():" << mem_size; VLOG(2) << "param.x->dims().size():" << param.x->dims().size(); VLOG(2) << "param.x->dims():" << param.x->dims(); VLOG(2) << "param.y->dims().size():" << param.y->dims().size(); VLOG(2) << "param.y->dims():" << param.y->dims(); +#endif auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size); CopyFromHostSync(data, param.x->raw_data(), mem_size); } @@ -85,12 +87,14 @@ class IoCopykOpenCLToHostCompute CHECK(param.x->target() == TARGET(kOpenCL)); auto mem_size = param.x->memory_size(); +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "copy size " << mem_size; VLOG(2) << "param.x->dims().size():" << param.x->dims().size(); VLOG(2) << "param.x->dims():" << param.x->dims(); VLOG(2) << "param.y->dims().size():" << param.y->dims().size(); VLOG(2) << "param.y->dims():" << param.y->dims(); VLOG(2) << "param.process_type:" << param.process_type; +#endif auto* data = param.y->mutable_data(TARGET(kHost), mem_size); const cl::Buffer* x_ptr; @@ -104,7 +108,9 @@ class IoCopykOpenCLToHostCompute auto* wait_list = context.cl_wait_list(); auto it = wait_list->find(x_ptr); if (it != wait_list->end()) { +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "--- Find the sync event for the target cl tensor. 
---"; +#endif auto& event = *(it->second); event.wait(); } else { diff --git a/lite/kernels/opencl/layout_image_compute.cc b/lite/kernels/opencl/layout_image_compute.cc index 9ddaf9c6e5afd549ff950e2f708bc8336bed8f52..22b3533e123bc248b0ec59df593cd51fe0ad1391 100644 --- a/lite/kernels/opencl/layout_image_compute.cc +++ b/lite/kernels/opencl/layout_image_compute.cc @@ -74,6 +74,7 @@ class LayoutComputeBufferChwToImageDefault const int Stride1 = out_H * out_W; const int Stride0 = out_W; +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "param.process_type:" << param.process_type; VLOG(2) << "x_dims:" << x_dims; VLOG(2) << "param.x->memory_size():" << param.x->memory_size(); @@ -89,6 +90,7 @@ class LayoutComputeBufferChwToImageDefault VLOG(2) << "Stride2:" << Stride2; VLOG(2) << "Stride1:" << Stride1; VLOG(2) << "Stride0:" << Stride0; +#endif auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); @@ -177,6 +179,7 @@ class LayoutComputeImageDefaultToBufferChw new_dims[4 - x_dims.size() + j] = x_dims[j]; } +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "param.process_type:" << param.process_type; VLOG(2) << "x_dims:" << x_dims; VLOG(2) << "param.x->memory_size():" << param.x->memory_size(); @@ -186,6 +189,7 @@ class LayoutComputeImageDefaultToBufferChw << new_dims[1] << " " << new_dims[2] << " " << new_dims[3]; VLOG(2) << "y_dims:" << y_dims; VLOG(2) << "param.y->memory_size():" << param.y->memory_size(); +#endif size_t C = new_dims[1]; size_t in_height = new_dims[2]; @@ -217,8 +221,10 @@ class LayoutComputeImageDefaultToBufferChw CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, static_cast(C)); CL_CHECK_FATAL(status); +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3] << " " << (new_dims[0] * new_dims[2]); +#endif auto global_work_size = cl::NDRange{static_cast((new_dims[1] + 3) / 4), static_cast(new_dims[3]), diff --git a/lite/kernels/opencl/lrn_image_compute.cc b/lite/kernels/opencl/lrn_image_compute.cc index 
bb19e044ae4a7b296fbace00797b0c05521c8adb..edce0368ddc9cda54fdab44b472fcd0e771413ae 100644 --- a/lite/kernels/opencl/lrn_image_compute.cc +++ b/lite/kernels/opencl/lrn_image_compute.cc @@ -65,6 +65,7 @@ class LrnImageCompute : public KernelLitedims(); auto in_dims = x->dims(); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target(): " << TargetToStr(x->target()); VLOG(4) << "out->target(): " << TargetToStr(out->target()); VLOG(4) << "x->dims(): " << in_dims; @@ -74,6 +75,7 @@ class LrnImageCompute : public KernelLitedata(); @@ -81,9 +83,12 @@ class LrnImageCompute : public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); + +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image" << out_img; VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " << out_image_shape["height"]; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -97,8 +102,10 @@ class LrnImageCompute : public KernelLite{ static_cast(out_image_shape["width"]), static_cast(out_image_shape["height"])})); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "default_work_size: " << default_work_size[0] << ", " << default_work_size[1] << ", " << default_work_size[3]; +#endif cl_int status = kernel.setArg(arg_idx++, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(arg_idx++, *out_img); @@ -130,9 +137,10 @@ class LrnImageCompute : public KernelLiteemplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; +#endif } protected: diff --git a/lite/kernels/opencl/nearest_interp_image_compute.cc b/lite/kernels/opencl/nearest_interp_image_compute.cc index c34019161000bf25522c061194815e38932ba4d2..082f21ab1ae792ae33e9e2a368073274258b8884 100644 --- a/lite/kernels/opencl/nearest_interp_image_compute.cc +++ b/lite/kernels/opencl/nearest_interp_image_compute.cc @@ -87,6 +87,7 @@ class NearestInterpComputeImageDefault status = 
kernel.setArg(++arg_idx, static_cast(out_dims_w)); CL_CHECK_FATAL(status); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << "out_image_shape(w,h):" << out_image_shape["width"] << " " @@ -95,6 +96,7 @@ class NearestInterpComputeImageDefault << x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " << y_dims[1] << " " << y_dims[2] << " " << y_dims[3]; +#endif const std::vector& default_work_size = DefaultWorkSize(y_dims, diff --git a/lite/kernels/opencl/pad2d_image_compute.cc b/lite/kernels/opencl/pad2d_image_compute.cc index 7f4838149d1e2364baf0b1b2286fef4a74ee9a4b..1be4729ee1b24ac77383de4d7c111e9d37d29d6b 100644 --- a/lite/kernels/opencl/pad2d_image_compute.cc +++ b/lite/kernels/opencl/pad2d_image_compute.cc @@ -71,10 +71,12 @@ class Pad2dCompute : public KernelLitetarget():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "out->dims():" << out_dims; +#endif auto out_image_shape = InitImageDimInfoWith(out_dims); auto* x_img = x->data(); @@ -82,11 +84,13 @@ class Pad2dCompute : public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " " << out_image_shape["height"]; VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w; VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -98,9 +102,10 @@ class Pad2dCompute : public KernelLite{ static_cast(out_image_shape["width"]), static_cast(out_image_shape["height"])})); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "default_work_size: " << default_work_size[0] << ", " << default_work_size[1] << ", " << default_work_size[2]; - +#endif int pad_h0 = pad2d_param_->paddings[0]; int pad_h1 = 
pad2d_param_->paddings[1]; int pad_w0 = pad2d_param_->paddings[2]; @@ -144,9 +149,10 @@ class Pad2dCompute : public KernelLiteemplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; +#endif } protected: diff --git a/lite/kernels/opencl/pad2d_image_compute_test.cc b/lite/kernels/opencl/pad2d_image_compute_test.cc index d1e1e3bb4c8fc80fabacff52b66f20387dd7766f..c2371d07f31caf569cfe4b299bf2f88373eb3b9f 100644 --- a/lite/kernels/opencl/pad2d_image_compute_test.cc +++ b/lite/kernels/opencl/pad2d_image_compute_test.cc @@ -89,7 +89,7 @@ void pad2d_ref(const float *x_data, } } -#define LOOP_TEST +// #define LOOP_TEST // #define PRINT_RESULT TEST(pad2d_image2d, compute) { LOG(INFO) << "main steps of test: host -> layout(buf2img) -> " diff --git a/lite/kernels/opencl/pool_image_compute.cc b/lite/kernels/opencl/pool_image_compute.cc index c2a8f7c7cf87ba709beb5f30a0149dc2cd92d11b..39da325ebb10c85f153e349173aa833bbf5e1f6e 100644 --- a/lite/kernels/opencl/pool_image_compute.cc +++ b/lite/kernels/opencl/pool_image_compute.cc @@ -59,10 +59,14 @@ class PoolComputeImage2D : public KernelLite paddings = *param.paddings; std::vector strides = param.strides; std::vector ksize = param.ksize; + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_pooling: " << global_pooling; VLOG(4) << "pooling_type: " << pooling_type; VLOG(4) << "paddings : " << paddings[0] << " " << paddings[1] << " " << paddings[2] << " " << paddings[3] << " "; +#endif + if (global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[2 * i] = 0; @@ -70,6 +74,8 @@ class PoolComputeImage2D : public KernelLite(in_dims[i + 2]); } } + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "in_dims : [" << in_dims.size() << "]" << in_dims[0] << " " << in_dims[1] << " " << in_dims[2] << " " << in_dims[3]; VLOG(4) << "out_dims : [" << out_dims.size() << "]" << out_dims[0] << " " @@ -82,6 +88,8 @@ class PoolComputeImage2D 
: public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); // VLOG(4) << "out_image" << out_img; @@ -109,8 +119,10 @@ class PoolComputeImage2D : public KernelLite(); const Tensor* const x = param.x; @@ -64,8 +62,9 @@ class ReshapeComputeFloatImage : public KernelLitemutable_data( out_image_shape.at("width"), out_image_shape.at("height")); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "out_dims= " << out_dims; - +#endif const std::vector& default_work_size = DefaultWorkSize( out_dims, DDim(std::vector{ @@ -94,6 +93,8 @@ class ReshapeComputeFloatImage : public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); // LOG(INFO) << "out_image" << out_img; diff --git a/lite/kernels/xpu/bridges/graph.cc b/lite/kernels/xpu/bridges/graph.cc index 43aaad3402b7873dbaa67d4c4897b5378e098500..4af8a2bd3464efaaec6937996445736068f0f656 100644 --- a/lite/kernels/xpu/bridges/graph.cc +++ b/lite/kernels/xpu/bridges/graph.cc @@ -49,7 +49,7 @@ std::shared_ptr Graph::Add(const std::string& name, CHECK_GE(idx, 1); node->set_data(std::make_shared(layer)); // Generate a unique name for the current XTCL layer - builder_.SetLayer(name + "__" + std::to_string(idx)); + builder_.SetLayer(name + "__" + paddle::lite::to_string(idx)); return node; } diff --git a/lite/model_parser/naive_buffer/naive_buffer_test.cc b/lite/model_parser/naive_buffer/naive_buffer_test.cc index 8b6ffb4dcf481bbb8df92e7e15c1d569d575bcae..98789e8006817fceb4745bffd0c095da7ad360fc 100644 --- a/lite/model_parser/naive_buffer/naive_buffer_test.cc +++ b/lite/model_parser/naive_buffer/naive_buffer_test.cc @@ -155,7 +155,7 @@ TEST(ListBuilder, basic) { for (int i = 0; i < num_elems; i++) { auto* elem = li.New(); - elem->set("elem-" + std::to_string(i)); + elem->set("elem-" + paddle::lite::to_string(i)); } li.Save(); table.SaveToFile("2.bf"); @@ -169,7 +169,7 @@ TEST(ListBuilder, basic) { li1.Load(); for (int i = 0; i < num_elems; i++) { - ASSERT_EQ(li1.Get(i).data(), "elem-" + 
std::to_string(i)); + ASSERT_EQ(li1.Get(i).data(), "elem-" + paddle::lite::to_string(i)); } } diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 34c7b8d6669b4eddfa6fecaa67cf4523b5c36566..48e27560317c089446e8dbc5040786f34ca962c4 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -144,6 +144,8 @@ add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS}) if (LITE_WITH_TRAIN) add_operator(mean_grad_op extra SRCS mean_grad_op.cc DEPS ${op_DEPS}) add_operator(activation_grad_ops basic SRCS activation_grad_ops.cc DEPS ${op_DEPS}) + add_operator(elementwise_grad_op extra SRCS elementwise_grad_ops.cc DEPS ${op_DEPS}) + add_operator(mul_grad_op basic SRCS mul_grad_op.cc DEPS ${op_DEPS}) add_operator(sgd_op extra SRCS sgd_op.cc DEPS ${op_DEPS}) endif() diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc index 9ae52d1cb6a406dc8d1059ad97f3757dbc0a31fa..70ad3a32a83003e449524205a71dcc7536b9a11e 100644 --- a/lite/operators/conv_op.cc +++ b/lite/operators/conv_op.cc @@ -80,6 +80,34 @@ void UpdatePaddingAndDilation(std::vector* paddings, } } +bool ConvOpLite::SmartInferShape() { + if (!last_input_shapes.empty()) { + if (last_input_shapes[0] == param_.x->dims() && + last_input_lods[0] == param_.x->lod()) { + param_.output->Resize(last_output_shapes[0]); + param_.output->set_lod(last_output_lods[0]); + return true; + } + } + + this->InferShape(); + + if (!last_input_shapes.empty()) { + last_input_shapes.clear(); + last_input_lods.clear(); + } + last_input_shapes.push_back(param_.x->dims()); + last_input_lods.push_back(param_.x->lod()); + + if (!last_output_shapes.empty()) { + last_output_shapes.clear(); + last_output_lods.clear(); + } + last_output_shapes.push_back(param_.output->dims()); + last_output_lods.push_back(param_.output->lod()); + + return true; +} bool ConvOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); @@ -104,9 +132,9 @@ bool 
ConvOpLite::InferShape() const { // Set output dims param_.output->Resize(lite::DDim(output_shape)); - // share LoD - // param_.output->set_lod(param_.x->lod()); + param_.output->set_lod(param_.x->lod()); + return true; } diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index 63107022f1ef69a21d37373c4a257625f8b0f5e3..3379fb409529e261f4af38ef2ee3483f17cc8a3b 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -36,6 +36,7 @@ class ConvOpLite : public OpLite { bool CheckShape() const override; bool InferShape() const override; + bool SmartInferShape() override; // TODO(Superjomn) replace framework::OpDesc with a lite one. bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override { diff --git a/lite/operators/elementwise_grad_ops.cc b/lite/operators/elementwise_grad_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..9d964bf9e36889f2bc72b2656d23bf4022cc121c --- /dev/null +++ b/lite/operators/elementwise_grad_ops.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/elementwise_grad_ops.h" +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool ElementwiseGradOp::CheckShape() const { + CHECK_OR_FALSE(param_.XGrad || param_.YGrad); + CHECK_OR_FALSE(param_.OutGrad); + return true; +} + +bool ElementwiseGradOp::InferShape() const { + auto x_dim = param_.X->dims(); + auto y_dim = param_.Y->dims(); + if (param_.XGrad) { + param_.XGrad->Resize(x_dim); + } + if (param_.YGrad) { + param_.YGrad->Resize(y_dim); + } + return true; +} + +bool ElementwiseGradOp::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + auto Y_name = opdesc.Input("Y").front(); + auto X_name = opdesc.Input("X").front(); + auto Out_name = opdesc.Input("Out@GRAD").front(); + CHECK(!opdesc.Output("X@GRAD").empty() || !opdesc.Output("Y@GRAD").empty()) + << "at least one of 'X@GRAD' and 'Y@GRAD' is not empty"; + + if (!opdesc.Output("X@GRAD").empty()) { + auto x_grad_name = opdesc.Output("X@GRAD").front(); + param_.XGrad = GetMutableVar(scope, x_grad_name); + } + if (!opdesc.Output("Y@GRAD").empty()) { + auto y_grad_name = opdesc.Output("Y@GRAD").front(); + param_.YGrad = GetMutableVar(scope, y_grad_name); + } + + param_.X = GetVar(scope, X_name); + param_.Y = GetVar(scope, Y_name); + param_.OutGrad = GetVar(scope, Out_name); + param_.axis = opdesc.GetAttr("axis"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(elementwise_sub_grad, + paddle::lite::operators::ElementwiseGradOp); +REGISTER_LITE_OP(elementwise_add_grad, + paddle::lite::operators::ElementwiseGradOp); + +REGISTER_LITE_OP(elementwise_grad_mul, + paddle::lite::operators::ElementwiseGradOp); +REGISTER_LITE_OP(elementwise_grad_max, + paddle::lite::operators::ElementwiseGradOp); diff --git a/lite/operators/elementwise_grad_ops.h b/lite/operators/elementwise_grad_ops.h new file mode 100644 index 
0000000000000000000000000000000000000000..c45d581936207f0b37ee70a0505b912d0b509e35 --- /dev/null +++ b/lite/operators/elementwise_grad_ops.h @@ -0,0 +1,44 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class ElementwiseGradOp : public OpLite { + public: + explicit ElementwiseGradOp(const std::string& op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "elementwise_grad_op"; } + + private: + mutable operators::ElementwiseGradParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/elementwise_ops.cc b/lite/operators/elementwise_ops.cc index 3dc6f06955d421bc1f25994139cfee5dee9bc472..044126b3c22fa853d4908c06c307f32278fa5b9b 100644 --- a/lite/operators/elementwise_ops.cc +++ b/lite/operators/elementwise_ops.cc @@ -26,7 +26,38 @@ bool ElementwiseOp::CheckShape() const { CHECK_OR_FALSE(param_.Out); return true; } +bool ElementwiseOp::SmartInferShape() { + if (!last_input_shapes.empty()) { + if (last_input_shapes[0] == param_.X->dims() && + 
last_input_shapes[1] == param_.Y->dims() && + last_input_lods[0] == param_.X->lod() && + last_input_lods[1] == param_.Y->lod()) { + param_.Out->Resize(last_output_shapes[0]); + param_.Out->set_lod(last_output_lods[0]); + return true; + } + } + + this->InferShape(); + + if (!last_input_shapes.empty()) { + last_input_shapes.clear(); + last_input_lods.clear(); + } + last_input_shapes.push_back(param_.X->dims()); + last_input_lods.push_back(param_.X->lod()); + last_input_shapes.push_back(param_.Y->dims()); + last_input_lods.push_back(param_.Y->lod()); + + if (!last_output_shapes.empty()) { + last_output_shapes.clear(); + last_output_lods.clear(); + } + last_output_shapes.push_back(param_.Out->dims()); + last_output_lods.push_back(param_.Out->lod()); + return true; +} bool ElementwiseOp::InferShape() const { auto x_dim = param_.X->dims(); auto y_dim = param_.Y->dims(); @@ -81,6 +112,7 @@ bool ElementwiseOp::InferShape() const { auto out_lod = param_.Out->mutable_lod(); *out_lod = param_.X->lod(); } + return true; } diff --git a/lite/operators/elementwise_ops.h b/lite/operators/elementwise_ops.h index d888e3d1c14b5d3129e01d12c75e1f590c17f297..9d6e5781b9754eb22be11da0d7f77b764eb25912 100644 --- a/lite/operators/elementwise_ops.h +++ b/lite/operators/elementwise_ops.h @@ -28,6 +28,7 @@ class ElementwiseOp : public OpLite { bool CheckShape() const override; bool InferShape() const override; + bool SmartInferShape() override; bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; diff --git a/lite/operators/fc_op.cc b/lite/operators/fc_op.cc index eff9300fea4caf412186bfc8d0ad136686507be5..345fc0d605ccd68e3a6ef72429e20400a772568c 100644 --- a/lite/operators/fc_op.cc +++ b/lite/operators/fc_op.cc @@ -48,6 +48,33 @@ bool FcOpLite::CheckShape() const { return true; } +bool FcOpLite::SmartInferShape() { + if (!last_input_shapes.empty() && !last_output_shapes.empty()) { + if (last_input_shapes[0] == param_.input->dims() && + last_input_lods[0] == 
param_.input->lod()) { + param_.output->Resize(last_output_shapes[0]); + param_.output->set_lod(last_output_lods[0]); + return true; + } + } + + this->InferShape(); + + if (!last_input_shapes.empty()) { + last_input_shapes.clear(); + last_input_lods.clear(); + } + last_input_shapes.push_back(param_.input->dims()); + last_input_lods.push_back(param_.input->lod()); + if (!last_output_shapes.empty()) { + last_output_shapes.clear(); + last_output_lods.clear(); + } + last_output_shapes.push_back(param_.output->dims()); + last_output_lods.push_back(param_.output->lod()); + + return true; +} bool FcOpLite::InferShape() const { const auto& input_dims = param_.input->dims(); const auto& w_dims = param_.w->dims(); @@ -64,6 +91,7 @@ bool FcOpLite::InferShape() const { // share LoD param_.output->set_lod(param_.input->lod()); + return true; } diff --git a/lite/operators/fc_op.h b/lite/operators/fc_op.h index ec449cd4bdc33f191c33fc04f215ad672b283215..f5dc302e27a220ee1f1e0679cbb3c2ed257747dd 100644 --- a/lite/operators/fc_op.h +++ b/lite/operators/fc_op.h @@ -36,6 +36,7 @@ class FcOpLite : public OpLite { bool CheckShape() const override; bool InferShape() const override; + bool SmartInferShape() override; bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; diff --git a/lite/operators/mul_grad_op.cc b/lite/operators/mul_grad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8215521637cbc29a4bdcc4b735b9658fc4cc4840 --- /dev/null +++ b/lite/operators/mul_grad_op.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/mul_grad_op.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool MulGradOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.y); + CHECK_OR_FALSE(param_.output_grad); + CHECK_OR_FALSE(param_.x_grad || param_.y_grad); + CHECK_OR_FALSE(param_.x_num_col_dims); + CHECK_OR_FALSE(param_.y_num_col_dims); + + const auto x_dims = param_.x->dims(); + const auto y_dims = param_.y->dims(); + const auto out_dims = param_.output_grad->dims(); + + CHECK_GT_OR_FALSE(x_dims.size(), static_cast(param_.x_num_col_dims)); + CHECK_GT_OR_FALSE(y_dims.size(), static_cast(param_.y_num_col_dims)); + + auto x_flatten_dims = flatten_2d(x_dims, param_.x_num_col_dims); + auto y_flatten_dims = flatten_2d(y_dims, param_.y_num_col_dims); + auto out_flatten_dims = flatten_2d(out_dims, param_.x_num_col_dims); + + // Out = X * Y; + CHECK_EQ_OR_FALSE(x_flatten_dims[1], y_flatten_dims[0]); + CHECK_EQ_OR_FALSE(x_flatten_dims[0], out_flatten_dims[0]); + CHECK_EQ_OR_FALSE(y_flatten_dims[1], out_flatten_dims[1]); + return true; +} + +bool MulGradOpLite::InferShape() const { + const auto x_dims = param_.x->dims(); + const auto y_dims = param_.y->dims(); + if (param_.x_grad) { + param_.x_grad->Resize(x_dims); + param_.x_grad->set_lod(param_.x->lod()); + } + if (param_.y_grad) { + param_.y_grad->Resize(y_dims); + param_.y_grad->set_lod(param_.y->lod()); + } +} + +bool MulGradOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + 
CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("Y").empty()); + CHECK(!op_desc.Input("Out@GRAD").empty()); + CHECK(!op_desc.Output("X@GRAD").empty() || !op_desc.Output("Y@GRAD").empty()) + << "at least one of 'X@GRAD' and 'Y@GRAD' is not empty"; + + auto *x_var = scope->FindVar(op_desc.Input("X").front()); + CHECK(x_var); + param_.x = &x_var->Get(); + + auto *y_var = scope->FindVar(op_desc.Input("Y").front()); + CHECK(y_var); + param_.y = &y_var->Get(); + + auto *out_grad_var = scope->FindVar(op_desc.Input("Out@GRAD").front()); + CHECK(out_grad_var); + param_.output_grad = &out_grad_var->Get(); + + if (!op_desc.Output("X@GRAD").empty()) { + auto *x_grad_var = scope->FindVar(op_desc.Output("X@GRAD").front()); + CHECK(x_grad_var); + param_.x_grad = x_grad_var->GetMutable(); + } + + if (!op_desc.Output("Y@GRAD").empty()) { + auto *y_grad_var = scope->FindVar(op_desc.Output("Y@GRAD").front()); + CHECK(y_grad_var); + param_.y_grad = y_grad_var->GetMutable(); + } + param_.x_num_col_dims = op_desc.GetAttr("x_num_col_dims"); + param_.y_num_col_dims = op_desc.GetAttr("y_num_col_dims"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(mul_grad, paddle::lite::operators::MulGradOpLite); diff --git a/lite/operators/mul_grad_op.h b/lite/operators/mul_grad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ef61f54f9b88cd691ab98c4d8904b848dcea66b5 --- /dev/null +++ b/lite/operators/mul_grad_op.h @@ -0,0 +1,62 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class MulGradOpLite : public OpLite { + public: + MulGradOpLite() {} + + explicit MulGradOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + + std::string DebugString() const override { return "mul_grad"; } + + private: + mutable MulGradParam param_; +}; + +std::vector flatten_2d(DDim dims, int num_col_dims) { + std::vector flatten_dims{1, 1}; + for (int i = 0; i < dims.size(); i++) { + if (i < num_col_dims) { + flatten_dims[0] *= dims[i]; + } else { + flatten_dims[1] *= dims[i]; + } + } + return flatten_dims; +} + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/mul_op.h b/lite/operators/mul_op.h index e53168e00e0e541e6b317e1633a8afbf33018d6e..caf7bf6ae902ac4e4f22d4a9aadfa108fa7622da 100644 --- a/lite/operators/mul_op.h +++ b/lite/operators/mul_op.h @@ -66,28 +66,6 @@ class MulOpLite : public OpLite { mutable MulParam param_; }; -#ifdef LITE_WITH_TRAIN -class MulGradOpLite : public OpLite { - public: - MulGradOpLite() {} - - explicit MulGradOpLite(const std::string &type) : OpLite(type) {} - - bool CheckShape() 
const override; - - bool InferShape() const override; - - void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } - - bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; - - std::string DebugString() const override { return "mul_grad"; } - - private: - mutable MulGradParam param_; -}; -#endif - } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 6d18f1bf348530fc111499ca7cbb89e9bec88d9d..36d3b42c6b315a3858f475bd5756579137528051 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -387,10 +387,11 @@ struct ElementwiseParam { }; struct ElementwiseGradParam { + const lite::Tensor* X{}; const lite::Tensor* Y{}; - const lite::Tensor* Out_grad{}; - lite::Tensor* X_grad{}; - lite::Tensor* Y_grad{}; + const lite::Tensor* OutGrad{}; + lite::Tensor* XGrad{}; + lite::Tensor* YGrad{}; int axis{-1}; // for broadcasting. }; diff --git a/lite/operators/softmax_op.cc b/lite/operators/softmax_op.cc index 1e89fc1a2af407ebbe11f207bd33a1dabb811dc0..0989c9139763a435d67deb21a2ab233e1c2f3bd9 100644 --- a/lite/operators/softmax_op.cc +++ b/lite/operators/softmax_op.cc @@ -29,10 +29,39 @@ bool SoftmaxOp::CheckShape() const { return true; } +bool SoftmaxOp::SmartInferShape() { + if (!last_input_shapes.empty() && !last_output_shapes.empty()) { + if (param_.x->dims() == last_input_shapes[0] && + param_.x->lod() == last_input_lods[0]) { + param_.output->Resize(last_output_shapes[0]); + param_.output->set_lod(last_output_lods[0]); + return true; + } + } + + this->InferShape(); + + if (!last_input_shapes.empty()) { + last_input_shapes.clear(); + last_input_lods.clear(); + } + last_input_shapes.push_back(param_.x->dims()); + last_input_lods.push_back(param_.x->lod()); + + if (!last_output_shapes.empty()) { + last_output_shapes.clear(); + last_output_lods.clear(); + } + last_output_shapes.push_back(param_.output->dims()); + 
last_output_lods.push_back(param_.output->lod()); + return true; +} + bool SoftmaxOp::InferShape() const { param_.output->Resize(param_.x->dims()); auto out_lod = param_.output->mutable_lod(); *out_lod = param_.x->lod(); + return true; } diff --git a/lite/operators/softmax_op.h b/lite/operators/softmax_op.h index bb24acad344f02fe3677484fd2c4c31326683a13..c65d039fda02c5396eff829bede3b4ffdeac0051 100644 --- a/lite/operators/softmax_op.h +++ b/lite/operators/softmax_op.h @@ -31,6 +31,7 @@ class SoftmaxOp : public OpLite { bool CheckShape() const override; bool InferShape() const override; + bool SmartInferShape() override; bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 41e7c619489cdb974b238f6584032cc778f9e919..f4afe9ee3c3c0f9b325ac55a0c2c6a6454617e57 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -65,6 +65,8 @@ if(LITE_BUILD_EXTRA) if (LITE_WITH_TRAIN) lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} 
${lite_ops} ${host_kernels}) endif() diff --git a/lite/tests/kernels/concat_compute_test.cc b/lite/tests/kernels/concat_compute_test.cc index 3e30035f1011405ad9beffefd0df91132747a609..18e4701bdf3e99fbb6f76ed9ac78bbbbfda60a1c 100644 --- a/lite/tests/kernels/concat_compute_test.cc +++ b/lite/tests/kernels/concat_compute_test.cc @@ -128,7 +128,7 @@ class ConcateComputeTester : public arena::TestCase { for (int i = 0; i < x_dims_.production(); i++) { x_data[i] = static_cast(i + n); } - const std::string x_name = "x_tensor_" + std::to_string(n); + const std::string x_name = "x_tensor_" + paddle::lite::to_string(n); x_vct_.push_back(x_name); SetCommonTensor(x_name, x_dims_, x_data.data()); } diff --git a/lite/tests/kernels/elementwise_grad_compute_test.cc b/lite/tests/kernels/elementwise_grad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b5fbbb65d3d7e17bf90afb71f5c8154f0d88488 --- /dev/null +++ b/lite/tests/kernels/elementwise_grad_compute_test.cc @@ -0,0 +1,541 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/arm/elementwise_grad_compute.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/arm/elementwise_compute.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +using param_t = operators::ElementwiseParam; +using grad_param_t = operators::ElementwiseGradParam; +using kernel_add_t = ElementwiseAddCompute; +using grad_kernel_add_t = ElementwiseAddGradCompute; +using kernel_sub_t = ElementwiseSubCompute; +using grad_kernel_sub_t = ElementwiseSubGradCompute; + +void elementwise_common(grad_param_t& param, // NOLINT + std::vector& out_grad, // NOLINT + std::vector& x_grad, // NOLINT + std::vector& y_grad, // NOLINT + std::string flag) { + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + if (x_dims == y_dims) { + for (int i = 0; i < x_dims.production(); ++i) { + if (flag == "add") { + x_grad[i] = out_grad[i]; + y_grad[i] = out_grad[i]; + } + if (flag == "sub") { + x_grad[i] = out_grad[i]; + y_grad[i] = -out_grad[i]; + } + } + } else { + LOG(FATAL) << "unsupport dims"; + } +} + +class ElementwiseAddGradTester { + public: + explicit ElementwiseAddGradTester(const DDim& x_dims, + const DDim& y_dims, + int axis) + : x_dims_(x_dims), y_dims_(y_dims), axis_(axis) {} + + void prepare_kernel() { + std::unique_ptr ctx1(new KernelContext); + ctx1->As(); + kernel_.SetContext(std::move(ctx1)); + + std::unique_ptr ctx3(new KernelContext); + ctx3->As(); + grad_kernel_.SetContext(std::move(ctx3)); + } + + void run_forward(param_t* param, + kernel_add_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + float* out_vec) { + Tensor x; + Tensor y; + Tensor output; + x.Resize(x_dims_); + y.Resize(y_dims_); + output.Resize(DDim(out_dims_)); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + 
y_data[i] = y_vec[i]; + } + + param->X = &x; + param->Y = &y; + param->Out = &output; + param->axis = axis_; + kernel->SetParam(*param); + kernel->Launch(); + + auto* output_data = output.mutable_data(); + for (int i = 0; i < out_dims_.production(); i++) { + out_vec[i] = output_data[i]; + } + } + + void run_backward(grad_param_t* param, + grad_kernel_add_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + const std::vector& out_grad_vec, + float* x_grad_vec, + float* y_grad_vec) { + Tensor x; + Tensor x_grad; + Tensor y; + Tensor y_grad; + Tensor out_grad; + x.Resize(x_dims_); + x_grad.Resize(x_dims_); + y.Resize(y_dims_); + y_grad.Resize(y_dims_); + out_grad.Resize(out_dims_); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_grad_data = out_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + for (int i = 0; i < out_dims_.production(); i++) { + out_grad_data[i] = out_grad_vec[i]; + } + + param->X = &x; + param->XGrad = &x_grad; + param->Y = &y; + param->YGrad = &y_grad; + param->OutGrad = &out_grad; + param->axis = axis_; + + kernel->SetParam(*param); + kernel->Launch(); + + auto* x_grad_data = x_grad.mutable_data(); + auto* y_grad_data = y_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_grad_vec[i] = x_grad_data[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_grad_vec[i] = y_grad_data[i]; + } + } + + void check_grad(float delta2, float max_grad_delta2) { + std::vector out_shape; + // infer shape + auto x_dim = x_dims_; + auto y_dim = y_dims_; + if (x_dim == y_dim) { + out_dims_ = x_dim; + } else { + int max_dim = (x_dim.size() > y_dim.size() ? x_dim.size() : y_dim.size()); + int axis = param_.axis; + axis = + (axis == -1 ? 
std::abs(static_cast(x_dim.size() - y_dim.size())) + : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + + if (x_dim.size() > y_dim.size()) { + for (int i = 0; i < axis; ++i) { + y_dims_array[i] = 1; + } + if (axis + y_dim.size() < max_dim) { + for (int i = axis + y_dim.size(); i < max_dim; ++i) { + y_dims_array[i] = 1; + } + } + x_dims_array = x_dim.Vectorize(); + for (int i = 0; i < y_dim.size(); ++i) { + y_dims_array[i + axis] = y_dim[i]; + } + } else { + for (int i = 0; i < axis; ++i) { + x_dims_array[i] = 1; + } + if (axis + x_dim.size() < max_dim) { + for (int i = axis + x_dim.size(); i < max_dim; ++i) { + x_dims_array[i] = 1; + } + } + y_dims_array = y_dim.Vectorize(); + for (int i = 0; i < x_dim.size(); ++i) { + x_dims_array[i + axis] = x_dim[i]; + } + } + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] == -1 || y_dims_array[i] == -1) { + out_dims_array[i] = -1; + } else { + out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); + } + } + out_dims_ = DDim(out_dims_array); + } + // infer end + // forward + std::vector x(x_dims_.production()); + std::vector y(y_dims_.production()); + std::vector out(out_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); + this->run_forward(¶m_, &kernel_, x, y, out.data()); + + for (int i = 0; i < x_dims_.production(); i++) { + LOG(INFO) << "x_" << i << ": " << x[i]; + } + + for (int i = 0; i < y_dims_.production(); i++) { + LOG(INFO) << "y_" << i << ": " << y[i]; + } + + for (int i = 0; i < out_dims_.production(); i++) { + LOG(INFO) << "out_" << i << ": " << out[i]; + } + + // backward + std::vector out_grad(out_dims_.production()); + std::vector x_grad(x_dims_.production()); + std::vector y_grad(y_dims_.production()); + for (int i = 0; i < out_dims_.production(); i++) { + out_grad[i] = 1.0; + } + this->run_backward(&grad_param_, + &grad_kernel_, 
+ x, + y, + out_grad, + x_grad.data(), + y_grad.data()); + + for (int i = 0; i < x_grad.size(); i++) { + LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; + } + + for (int i = 0; i < y_grad.size(); i++) { + LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; + } + + // get numeric gradient + std::vector x_delta(x_dims_.production()); + std::vector y_delta(y_dims_.production()); + std::vector out_delta(out_dims_.production()); + Tensor tensor_x; + Tensor tensor_y; + tensor_x.Resize(x_dims_); + tensor_y.Resize(y_dims_); + grad_param_.X = &tensor_x; + grad_param_.Y = &tensor_y; + + elementwise_common(grad_param_, out_grad, x_delta, y_delta, "add"); + + float max_grad_delta = 0.0005; + for (int i = 0; i < x_dims_.production(); i++) { + EXPECT_NEAR(x_grad[i], x_delta[i], max_grad_delta); + EXPECT_NEAR(y_grad[i], y_delta[i], max_grad_delta); + } + } + + private: + DDim x_dims_; + DDim y_dims_; + DDim out_dims_; + int axis_; + kernel_add_t kernel_; + grad_kernel_add_t grad_kernel_; + param_t param_; + grad_param_t grad_param_; +}; + +class ElementwiseSubGradTester { + public: + explicit ElementwiseSubGradTester(const DDim& x_dims, + const DDim& y_dims, + int axis) + : x_dims_(x_dims), y_dims_(y_dims), axis_(axis) {} + + void prepare_kernel() { + std::unique_ptr ctx1(new KernelContext); + ctx1->As(); + kernel_.SetContext(std::move(ctx1)); + + std::unique_ptr ctx3(new KernelContext); + ctx3->As(); + grad_kernel_.SetContext(std::move(ctx3)); + } + + void run_forward(param_t* param, + kernel_sub_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + float* out_vec) { + Tensor x; + Tensor y; + Tensor output; + x.Resize(x_dims_); + y.Resize(y_dims_); + output.Resize(DDim(out_dims_)); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + + param->X = &x; + param->Y = &y; + param->Out = &output; 
+ param->axis = axis_; + kernel->SetParam(*param); + kernel->Launch(); + + auto* output_data = output.mutable_data(); + for (int i = 0; i < out_dims_.production(); i++) { + out_vec[i] = output_data[i]; + } + } + + void run_backward(grad_param_t* param, + grad_kernel_sub_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + const std::vector& out_grad_vec, + float* x_grad_vec, + float* y_grad_vec) { + Tensor x; + Tensor x_grad; + Tensor y; + Tensor y_grad; + Tensor out_grad; + x.Resize(x_dims_); + x_grad.Resize(x_dims_); + y.Resize(y_dims_); + y_grad.Resize(y_dims_); + out_grad.Resize(out_dims_); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_grad_data = out_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + for (int i = 0; i < out_dims_.production(); i++) { + out_grad_data[i] = out_grad_vec[i]; + } + + param->X = &x; + param->XGrad = &x_grad; + param->Y = &y; + param->YGrad = &y_grad; + param->OutGrad = &out_grad; + param->axis = axis_; + + kernel->SetParam(*param); + kernel->Launch(); + + auto* x_grad_data = x_grad.mutable_data(); + auto* y_grad_data = y_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_grad_vec[i] = x_grad_data[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_grad_vec[i] = y_grad_data[i]; + } + } + + void check_grad(float delta2, float max_grad_delta2) { + std::vector out_shape; + // infer shape + auto x_dim = x_dims_; + auto y_dim = y_dims_; + if (x_dim == y_dim) { + out_dims_ = x_dim; + } else { + int max_dim = (x_dim.size() > y_dim.size() ? x_dim.size() : y_dim.size()); + int axis = param_.axis; + axis = + (axis == -1 ? 
std::abs(static_cast(x_dim.size() - y_dim.size())) + : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + + if (x_dim.size() > y_dim.size()) { + for (int i = 0; i < axis; ++i) { + y_dims_array[i] = 1; + } + if (axis + y_dim.size() < max_dim) { + for (int i = axis + y_dim.size(); i < max_dim; ++i) { + y_dims_array[i] = 1; + } + } + x_dims_array = x_dim.Vectorize(); + for (int i = 0; i < y_dim.size(); ++i) { + y_dims_array[i + axis] = y_dim[i]; + } + } else { + for (int i = 0; i < axis; ++i) { + x_dims_array[i] = 1; + } + if (axis + x_dim.size() < max_dim) { + for (int i = axis + x_dim.size(); i < max_dim; ++i) { + x_dims_array[i] = 1; + } + } + y_dims_array = y_dim.Vectorize(); + for (int i = 0; i < x_dim.size(); ++i) { + x_dims_array[i + axis] = x_dim[i]; + } + } + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] == -1 || y_dims_array[i] == -1) { + out_dims_array[i] = -1; + } else { + out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); + } + } + out_dims_ = DDim(out_dims_array); + } + // infer end + // forward + std::vector x(x_dims_.production()); + std::vector y(y_dims_.production()); + std::vector out(out_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); + this->run_forward(¶m_, &kernel_, x, y, out.data()); + + for (int i = 0; i < x_dims_.production(); i++) { + LOG(INFO) << "x_" << i << ": " << x[i]; + } + + for (int i = 0; i < y_dims_.production(); i++) { + LOG(INFO) << "y_" << i << ": " << y[i]; + } + + for (int i = 0; i < out_dims_.production(); i++) { + LOG(INFO) << "out_" << i << ": " << out[i]; + } + + // backward + std::vector out_grad(out_dims_.production()); + std::vector x_grad(x_dims_.production()); + std::vector y_grad(y_dims_.production()); + for (int i = 0; i < out_dims_.production(); i++) { + out_grad[i] = 1.0; + } + this->run_backward(&grad_param_, + &grad_kernel_, 
+ x, + y, + out_grad, + x_grad.data(), + y_grad.data()); + + for (int i = 0; i < x_grad.size(); i++) { + LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; + } + + for (int i = 0; i < y_grad.size(); i++) { + LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; + } + + // get numeric gradient + std::vector x_delta(x_dims_.production()); + std::vector y_delta(y_dims_.production()); + std::vector out_delta(out_dims_.production()); + Tensor tensor_x; + Tensor tensor_y; + tensor_x.Resize(x_dims_); + tensor_y.Resize(y_dims_); + grad_param_.X = &tensor_x; + grad_param_.Y = &tensor_y; + + elementwise_common(grad_param_, out_grad, x_delta, y_delta, "sub"); + + float max_grad_delta = 0.0005; + for (int i = 0; i < x_dims_.production(); i++) { + EXPECT_NEAR(x_grad[i], x_delta[i], max_grad_delta); + EXPECT_NEAR(y_grad[i], y_delta[i], max_grad_delta); + } + } + + private: + DDim x_dims_; + DDim y_dims_; + DDim out_dims_; + int axis_; + kernel_sub_t kernel_; + grad_kernel_sub_t grad_kernel_; + param_t param_; + grad_param_t grad_param_; +}; +void TestNormalCase(const std::vector& x_dims, + const std::vector& y_dims, + int axis) { + std::unique_ptr tester_add( + new ElementwiseAddGradTester(DDim(x_dims), DDim(y_dims), axis)); + std::unique_ptr tester_sub( + new ElementwiseSubGradTester(DDim(x_dims), DDim(y_dims), axis)); + + tester_add->prepare_kernel(); + tester_sub->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester_add->check_grad(delta, max_grad_delta); + tester_sub->check_grad(delta, max_grad_delta); +} + +TEST(mul_grad_arm, compute) { + LOG(INFO) << "Test Elementwise grad"; + DeviceInfo::Init(); + TestNormalCase({3, 2}, {3, 2}, 0); + TestNormalCase({3, 5}, {3, 5}, 1); + TestNormalCase({3, 4, 3}, {3, 4, 3}, 0); + TestNormalCase({9, 2, 5}, {9, 2, 5}, 1); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle +USE_LITE_KERNEL(elementwise_add_grad, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_add, kARM, 
kFloat, kNCHW, def); diff --git a/lite/tests/kernels/fill_constant_compute_test.cc b/lite/tests/kernels/fill_constant_compute_test.cc index 465b7becffe967ab77d0c1a237fe6a4951031b3a..bc2cfce7842c935898bd9ecddc6c2d0ac4c39af5 100644 --- a/lite/tests/kernels/fill_constant_compute_test.cc +++ b/lite/tests/kernels/fill_constant_compute_test.cc @@ -52,7 +52,8 @@ class FillConstantComputeTester : public arena::TestCase { is_use_shape_tensor_list_(is_use_shape_tensor_list) { if (is_use_shape_tensor_list) { for (int i = 0; i < shape.size(); i++) { - shape_tensor_list_.push_back(shape_tensor_ + std::to_string(i)); + shape_tensor_list_.push_back(shape_tensor_ + + paddle::lite::to_string(i)); } } } diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc index d9bbfaa8d049cf2bbcdea9b9c5e58d201e156a67..d070292332b65ed577ec6cefdb220ee691eb99e9 100644 --- a/lite/tests/kernels/mul_compute_test.cc +++ b/lite/tests/kernels/mul_compute_test.cc @@ -109,6 +109,7 @@ void TestMul(const std::vector& x_dims, int y_num_col_dims, const Place& place, float abs_error) { + LOG(INFO) << "run test arm"; std::unique_ptr tester(new MulComputeTester(place, "def", DDim(x_dims), @@ -131,7 +132,6 @@ TEST(Mul, precision) { #else return; #endif - TestMul({4, 5}, {5, 4}, 1, 1, place, abs_error); TestMul({4, 5}, {5, 4, 3, 2}, 1, 1, place, abs_error); TestMul({4, 20}, {5, 4, 3, 2}, 1, 2, place, abs_error); diff --git a/lite/tests/kernels/mul_grad_compute_test.cc b/lite/tests/kernels/mul_grad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..95cbb2f8b54dd41d6756f7ae0222a34a7bb18c1d --- /dev/null +++ b/lite/tests/kernels/mul_grad_compute_test.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/mul_grad_compute.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/arm/mul_compute.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +using param_t = operators::MulParam; +using grad_param_t = operators::MulGradParam; +using kernel_t = MulCompute; +using grad_kernel_t = MulGradCompute; + +class MulGradTester { + public: + explicit MulGradTester(const DDim& x_dims, + const DDim& y_dims, + int x_num_col_dims, + int y_num_col_dims) + : x_dims_(x_dims), + y_dims_(y_dims), + x_num_col_dims_(x_num_col_dims), + y_num_col_dims_(y_num_col_dims) {} + + void prepare_kernel() { + std::unique_ptr ctx1(new KernelContext); + ctx1->As(); + kernel_.SetContext(std::move(ctx1)); + + std::unique_ptr ctx2(new KernelContext); + ctx2->As(); + delta_kernel_.SetContext(std::move(ctx2)); + + std::unique_ptr ctx3(new KernelContext); + ctx3->As(); + grad_kernel_.SetContext(std::move(ctx3)); + } + + void run_forward(param_t* param, + kernel_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + float* out_vec) { + Tensor x; + Tensor y; + Tensor output; + x.Resize(x_dims_); + y.Resize(y_dims_); + output.Resize(DDim(out_dims_)); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + + param->x = &x; + param->y = &y; + param->output = &output; + param->x_num_col_dims = 
x_num_col_dims_; + param->y_num_col_dims = y_num_col_dims_; + kernel->SetParam(*param); + kernel->Launch(); + + auto* output_data = output.mutable_data(); + for (int i = 0; i < out_dims_.production(); i++) { + out_vec[i] = output_data[i]; + } + } + + void run_backward(grad_param_t* param, + grad_kernel_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + const std::vector& out_grad_vec, + float* x_grad_vec, + float* y_grad_vec) { + Tensor x; + Tensor x_grad; + Tensor y; + Tensor y_grad; + Tensor out_grad; + x.Resize(x_dims_); + x_grad.Resize(x_dims_); + y.Resize(y_dims_); + y_grad.Resize(y_dims_); + out_grad.Resize(out_dims_); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_grad_data = out_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + for (int i = 0; i < out_dims_.production(); i++) { + out_grad_data[i] = out_grad_vec[i]; + } + + param->x = &x; + param->x_grad = &x_grad; + param->y = &y; + param->y_grad = &y_grad; + param->output_grad = &out_grad; + param->x_num_col_dims = x_num_col_dims_; + param->y_num_col_dims = y_num_col_dims_; + kernel->SetParam(*param); + kernel->Launch(); + + auto* x_grad_data = x_grad.mutable_data(); + auto* y_grad_data = y_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_grad_vec[i] = x_grad_data[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_grad_vec[i] = y_grad_data[i]; + } + } + + void check_grad() { + std::vector out_shape; + for (int i = 0; i < x_num_col_dims_; i++) { + out_shape.push_back(x_dims_[i]); + } + for (int i = y_num_col_dims_; i < y_dims_.size(); i++) { + out_shape.push_back(y_dims_[i]); + } + out_dims_ = DDim(out_shape); + + // forward + std::vector x(x_dims_.production()); + std::vector y(y_dims_.production()); + std::vector out(out_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, 
x_dims_.production()); + fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); + this->run_forward(¶m_, &kernel_, x, y, out.data()); + + // backward + std::vector out_grad(out_dims_.production()); + std::vector x_grad(x_dims_.production()); + std::vector y_grad(y_dims_.production()); + for (int i = 0; i < out_dims_.production(); i++) { + out_grad[i] = 1.0; + } + this->run_backward(&grad_param_, + &grad_kernel_, + x, + y, + out_grad, + x_grad.data(), + y_grad.data()); + + // get numeric gradient + std::vector x_delta(x_dims_.production()); + std::vector y_delta(y_dims_.production()); + std::vector out_delta(out_dims_.production()); + + float delta = 0.001; + float max_grad_delta = 0.005; + for (int i = 0; i < x_dims_.production(); i++) { + for (int j = 0; j < x_dims_.production(); j++) { + if (i == j) { + x_delta[j] = x[j] + delta; + } else { + x_delta[j] = x[j]; + } + } + this->run_forward( + &delta_param_, &delta_kernel_, x_delta, y, out_delta.data()); + + float sum = 0; + for (int j = 0; j < out_dims_.production(); j++) { + sum += (out_delta[j] - out[j]); + } + + EXPECT_NEAR(x_grad[i], sum / delta, max_grad_delta); + } + + for (int i = 0; i < y_dims_.production(); i++) { + for (int j = 0; j < y_dims_.production(); j++) { + y_delta[j] = i == j ? 
y[j] + delta : y[j]; + } + this->run_forward( + &delta_param_, &delta_kernel_, x, y_delta, out_delta.data()); + float sum = 0; + for (int j = 0; j < out_dims_.production(); j++) { + sum += out_delta[j] - out[j]; + } + + EXPECT_NEAR(y_grad[i], sum / delta, max_grad_delta); + } + } + + private: + DDim x_dims_; + DDim y_dims_; + DDim out_dims_; + int x_num_col_dims_; + int y_num_col_dims_; + kernel_t kernel_; + kernel_t delta_kernel_; + grad_kernel_t grad_kernel_; + param_t param_; + param_t delta_param_; + grad_param_t grad_param_; +}; + +void TestNormalCase(const std::vector& x_dims, + const std::vector& y_dims, + int x_num_col_dims, + int y_num_col_dims) { + std::unique_ptr tester(new MulGradTester( + DDim(x_dims), DDim(y_dims), x_num_col_dims, y_num_col_dims)); + + tester->prepare_kernel(); + + tester->check_grad(); +} + +TEST(mul_grad_arm, compute) { + LOG(INFO) << "Test Mul grad"; + DeviceInfo::Init(); + TestNormalCase({1, 3}, {3, 2}, 1, 1); + TestNormalCase({3, 2}, {2, 1}, 1, 1); + TestNormalCase({3, 1}, {1, 7}, 1, 1); + TestNormalCase({2, 3}, {3, 2}, 1, 1); + TestNormalCase({4, 5}, {5, 4}, 1, 1); + TestNormalCase({4, 5}, {5, 4, 3, 2}, 1, 1); + TestNormalCase({3, 4}, {2, 2, 3}, 1, 2); + TestNormalCase({4, 20}, {5, 4, 3, 2}, 1, 2); + TestNormalCase({4, 60}, {5, 4, 3, 2}, 1, 3); + TestNormalCase({2, 3, 4, 5}, {60, 4}, 1, 1); + TestNormalCase({2, 3, 4, 5}, {20, 4}, 2, 1); + TestNormalCase({2, 3, 4, 5}, {5, 4}, 3, 1); + TestNormalCase({2, 3, 4, 5}, {60, 3, 4, 5}, 1, 1); + TestNormalCase({2, 3, 4, 5}, {4, 5, 6, 2}, 2, 2); + TestNormalCase({2, 3, 4, 5}, {5, 1, 4, 2}, 3, 2); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle +USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(mul_grad, kARM, kFloat, kNCHW, def); diff --git a/lite/tests/kernels/reshape_compute_test.cc b/lite/tests/kernels/reshape_compute_test.cc index 1b20c8eaa2164eaf4d658fba72c28b860b5bea74..4fba28e2ab982b1f15e48c95dfa247b2ea56c1ae 100644 --- 
a/lite/tests/kernels/reshape_compute_test.cc +++ b/lite/tests/kernels/reshape_compute_test.cc @@ -45,7 +45,8 @@ class ReshapeComputeTester : public arena::TestCase { : TestCase(place, alias), dims_(dims) { if (is_shape_tensor_vct) { for (size_t i = 0; i < shape.size(); i++) { - shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i)); + shape_tensor_vct_.emplace_back(op_type_ + "/shape" + + paddle::lite::to_string(i)); } } else if (is_shape_tensor) { shape_tensor_ = op_type_ + "/shape"; diff --git a/lite/tests/kernels/slice_compute_test.cc b/lite/tests/kernels/slice_compute_test.cc index e8c63e2d729c931578de555cdf16cb066cd40e06..4d698ebc0d42a34cf07a85735c09bd49b3fb1284 100644 --- a/lite/tests/kernels/slice_compute_test.cc +++ b/lite/tests/kernels/slice_compute_test.cc @@ -168,8 +168,9 @@ class SliceComputeTester : public arena::TestCase { std::vector ends_tensor_list_; for (int i = 0; i < starts_.size(); ++i) { starts_tensor_list_.push_back("starts_tensor_list_" + - std::to_string(i)); - ends_tensor_list_.push_back("ends_tensor_list_" + std::to_string(i)); + paddle::lite::to_string(i)); + ends_tensor_list_.push_back("ends_tensor_list_" + + paddle::lite::to_string(i)); } op_desc->SetInput("StartsTensorList", {starts_tensor_list_}); op_desc->SetInput("EndsTensorList", {ends_tensor_list_}); @@ -203,15 +204,15 @@ class SliceComputeTester : public arena::TestCase { } else if (use_tensor_list_) { Scope& scope_ = this->scope(); for (int i = 0; i < starts_.size(); ++i) { - auto* tensor = - scope_.NewTensor("starts_tensor_list_" + std::to_string(i)); + auto* tensor = scope_.NewTensor("starts_tensor_list_" + + paddle::lite::to_string(i)); tensor->Resize(DDim({1})); auto* d = tensor->mutable_data(); d[0] = starts_[i]; } for (int i = 0; i < ends_.size(); ++i) { auto* tensor = - scope_.NewTensor("ends_tensor_list_" + std::to_string(i)); + scope_.NewTensor("ends_tensor_list_" + paddle::lite::to_string(i)); tensor->Resize(DDim({1})); auto* d = 
tensor->mutable_data(); d[0] = ends_[i]; diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index aba7bed4f1508d6dc2e813b16450470972b95de4..461ef7215e3ceb779b2522adbd5bb286036a0d8e 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -123,7 +123,7 @@ class UnsqueezeComputeTester : public arena::TestCase { } else if (input_axes_flag_ == 3) { std::string name = "axes_tensor_"; for (size_t i = 0; i < axes_.size(); i++) { - name = name + std::to_string(i); + name = name + paddle::lite::to_string(i); axes_tensor_list_.push_back(name); SetCommonTensor(name, DDim({1}), &axes_[i]); } diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 93bc95fa4a7136d2127370b076c6b51ccb29c9b5..e28dd6c53e53c477e56e044ada926b4056f1e4e1 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -291,6 +291,8 @@ function make_ios { -DLITE_ON_TINY_PUBLISH=ON \ -DLITE_WITH_OPENMP=OFF \ -DWITH_ARM_DOTPROD=OFF \ + -DLITE_BUILD_TAILOR=$BUILD_TAILOR \ + -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DARM_TARGET_ARCH_ABI=$abi \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -354,10 +356,12 @@ function make_x86 { -DWITH_LITE=ON \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ -DLITE_WITH_ARM=OFF \ + -DLITE_WITH_PYTHON=$BUILD_PYTHON \ -DWITH_GPU=OFF \ + -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_XPU=$BUID_XPU \ - -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DXPU_SDK_ROOT=$XPU_SDK_ROOT make publish_inference -j$NUM_PROC cd - diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 884576793db29bd745bc5397ca7d155c9701cd31..703da69fa59f3aa99bad9fb04c0decb591486058 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -184,7 +184,7 @@ function build_opencl { return 0 fi - build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}.opencl + build_dir=$cur_dir/build.lite.${os}.${abi}.${lang} mkdir -p $build_dir cd $build_dir @@ -193,11 
+193,10 @@ function build_opencl { cmake_opencl ${os} ${abi} ${lang} make opencl_clhpp -j$NUM_CORES_FOR_COMPILE build $TESTS_FILE - - # test publish inference lib - make publish_inference -j$NUM_CORES_FOR_COMPILE } + + # This method is only called in CI. function cmake_x86_for_CI { prepare_workspace # fake an empty __generated_code__.cc to pass cmake. @@ -387,7 +386,7 @@ function test_arm_android { echo "test name: ${test_name}" adb_work_dir="/data/local/tmp" - skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass") + skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl") for skip_name in ${skip_list[@]} ; do [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return done @@ -755,16 +754,58 @@ function arm_push_necessary_file { adb -s ${device} push ${testpath} ${adb_work_dir} } + +function test_opencl { + os=$1 + abi=$2 + lang=$3 + device=$4 + + if [[ ${os} == "armlinux" ]]; then + # TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf + echo "Skip test arm linux yet. 
armlinux must in another docker" + return 0 + fi + + if [[ ${os} == "android" && ${abi} == "armv7hf" ]]; then + echo "android do not need armv7hf" + return 0 + fi + + # prepare for CXXApi test + local adb="adb -s ${device}" + $adb shell mkdir -p /data/local/tmp/lite_naive_model_opt + + # opencl test should be marked with `opencl` + opencl_test_mark="opencl" + + for _test in $(cat $TESTS_FILE); do + # tell if this test is marked with `opencl` + if [[ $_test == *$opencl_test_mark* ]]; then + test_arm_android $_test $device + fi + done + +} + function build_test_arm_opencl { ######################################################################## cur=$PWD + # job 1-4 must be in one runner + prepare_adb_devices # job 1 build_opencl "android" "armv8" "gcc" + adb -s $device_armv8 shell 'rm -rf /data/local/tmp/*' + run_gen_code_test ${device_armv8} + test_opencl "android" "armv8" "gcc" ${device_armv8} cd $cur # job 2 build_opencl "android" "armv7" "gcc" + adb -s $device_armv7 shell 'rm -rf /data/local/tmp/*' + run_gen_code_test ${device_armv7} + test_opencl "android" "armv7" "gcc" ${device_armv7} cd $cur echo "Done" @@ -1099,6 +1140,8 @@ function main { ;; build_test_arm_opencl) build_test_arm_opencl + build_test_arm_subtask_model test_mobilenetv1 mobilenet_v1 + build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu shift ;; build_test_arm_subtask_android) diff --git a/lite/tools/cmake_tools/create_fake_kernel_registry.py b/lite/tools/cmake_tools/create_fake_kernel_registry.py index 35012d5b163aac2b6998790b4cfcf31e16cb1454..0b96652c6f78ee6bcf5498b9247f0a2391c70473 100644 --- a/lite/tools/cmake_tools/create_fake_kernel_registry.py +++ b/lite/tools/cmake_tools/create_fake_kernel_registry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# this module will record kernels in unvalid_places into all_kernel_faked.cc from __future__ import print_function import sys @@ -18,12 +19,13 @@ import logging from ast import RegisterLiteKernelParser from utils import * -if len(sys.argv) != 4: +if len(sys.argv) != 5: print("Error: create_fake_kernel_registry.py requires three inputs!") exit(1) -ops_list_path = sys.argv[1] -dest_path = sys.argv[2] -kernelmap_path = sys.argv[3] +kernels_list_path = sys.argv[1] +faked_kernels_list_path = sys.argv[2] +dest_path = sys.argv[3] +kernelmap_path = sys.argv[4] out_lines = [ '#pragma once', @@ -77,68 +79,85 @@ const std::map kernel2path_map{ ''' ] +def parse_fake_kernels_from_path(list_path): + with open(list_path) as f: + paths = set([path for path in f]) + for path in paths: + print('path', path) + with open(path.strip()) as g: + c = g.read() + kernel_parser = RegisterLiteKernelParser(c) + kernel_parser.parse() + + for k in kernel_parser.kernels: + kernel_name = "{op_type}_{target}_{precision}_{data_layout}_{alias}_class".format( + op_type=k.op_type, + target=k.target, + precision=k.precision, + data_layout=k.data_layout, + alias=k.alias + ) + + kernel_define = fake_kernel % ( + kernel_name, + k.target, + k.precision, + k.data_layout, + kernel_name + ) + + out_lines.append(kernel_define) + out_lines.append("") + + + key = "REGISTER_LITE_KERNEL(%s, %s, %s, %s, %s, %s)" % ( + k.op_type, + k.target, + k.precision, + k.data_layout, + '::paddle::lite::' + kernel_name, + k.alias + ) + out_lines.append(key) + + for input in k.inputs: + io = ' .BindInput("%s", {%s})' % (input.name, input.type) + out_lines.append(io) + for output in k.outputs: + io = ' 
.BindOutput("%s", {%s})' % (output.name, output.type) + out_lines.append(io) + out_lines.append(" .Finalize();") + out_lines.append("") + out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias)) + +def parse_sppported_kernels_from_path(list_path): + with open(list_path) as f: + paths = set([path for path in f]) + for path in paths: + print('path', path) + with open(path.strip()) as g: + c = g.read() + kernel_parser = RegisterLiteKernelParser(c) + kernel_parser.parse() + + for k in kernel_parser.kernels: + index = path.rindex('/') + filename = path[index + 1:] + map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % ( + k.op_type, + k.target, + k.precision, + k.data_layout, + k.alias, + filename.strip() + ) + kernel_src_map_lines.append(map_element) + + +parse_fake_kernels_from_path(faked_kernels_list_path) +parse_sppported_kernels_from_path(faked_kernels_list_path) +parse_sppported_kernels_from_path(kernels_list_path) -with open(ops_list_path) as f: - paths = set([path for path in f]) - for path in paths: - print('path', path) - with open(path.strip()) as g: - c = g.read() - kernel_parser = RegisterLiteKernelParser(c) - kernel_parser.parse() - - for k in kernel_parser.kernels: - kernel_name = "{op_type}_{target}_{precision}_{data_layout}_{alias}_class".format( - op_type = k.op_type, - target = k.target, - precision = k.precision, - data_layout = k.data_layout, - alias = k.alias, - ) - - kernel_define = fake_kernel % ( - kernel_name, - k.target, - k.precision, - k.data_layout, - kernel_name, - ) - - out_lines.append(kernel_define) - out_lines.append("") - - - key = "REGISTER_LITE_KERNEL(%s, %s, %s, %s, %s, %s)" % ( - k.op_type, - k.target, - k.precision, - k.data_layout, - '::paddle::lite::' + kernel_name, - k.alias, - ) - out_lines.append(key) - - for input in k.inputs: - io = ' .BindInput("%s", {%s})' % (input.name, input.type) - out_lines.append(io) - for output in k.outputs: - io = ' .BindOutput("%s", {%s})' % (output.name, 
output.type) - out_lines.append(io) - out_lines.append(" .Finalize();") - out_lines.append("") - out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias)) - - index = path.rindex('/') - filename = path[index + 1:] - map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % ( - k.op_type, - k.target, - k.precision, - k.data_layout, - k.alias, - filename.strip() - ) - kernel_src_map_lines.append(map_element) with open(dest_path, 'w') as f: logging.info("write kernel list to %s" % dest_path) f.write('\n'.join(out_lines)) diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py index f6a3af6bd3e5a2decfb6b3b65b0357bff8b4a378..560174bc632bec89b9655ff89fd5eeb9e7db7786 100644 --- a/lite/tools/cmake_tools/record_supported_kernel_op.py +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+# this module will record supported ops from kernels_src.txt from __future__ import print_function import sys @@ -18,12 +19,13 @@ import logging from ast import RegisterLiteKernelParser from ast import RegisterLiteOpParser -if len(sys.argv) != 4: - print("Error: record_supported_kernel_op.py requires three inputs!") - exit(1) +if len(sys.argv) != 5: + print("Error: record_supported_kernel_op.py requires four inputs!") + sys.exit(1) kernels_list_path = sys.argv[1] -ops_list_path = sys.argv[2] -kernel_op_map_dest_path = sys.argv[3] +faked_kernels_list_path = sys.argv[2] +ops_list_path = sys.argv[3] +kernel_op_map_dest_path = sys.argv[4] out_lines = [ @@ -51,11 +53,11 @@ const std::vector> supported_ops_target = { ''' ] -ops_lines=[] +ops_lines = [] # valid targets and valid_ops valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"] -valid_ops = [[],[],[],[],[],[],[],[],[],[]] +valid_ops = [[], [], [], [], [], [], [], [], [], []] class TargetType: kUnk = 0 kHost = 1 @@ -78,8 +80,21 @@ with open(kernels_list_path) as f: kernel_parser.parse() for k in kernel_parser.kernels: if hasattr(TargetType, k.target): - index=getattr(TargetType, k.target) + index = getattr(TargetType, k.target) valid_ops[index].append(k.op_type) +# record op_info of valid kernels into `valid_ops` according to different target type +with open(faked_kernels_list_path) as f: + paths = set([path for path in f]) + for path in paths: + with open(path.strip()) as g: + c = g.read() + kernel_parser = RegisterLiteKernelParser(c) + kernel_parser.parse() + for k in kernel_parser.kernels: + if hasattr(TargetType, k.target): + index = getattr(TargetType, k.target) + valid_ops[index].append(k.op_type) + # clear the repeated ops for target in valid_targets: @@ -114,7 +129,7 @@ with open(kernel_op_map_dest_path, 'w') as f: f.write('\n'.join(out_lines)) # write kernels into head file for target in valid_targets: - if len(valid_ops[getattr(TargetType, target)]) == 0 : 
+ if len(valid_ops[getattr(TargetType, target)]) == 0: f.write("\n // %s_OPS: " %target) f.write('\n {},') else: diff --git a/lite/utils/cv/CMakeLists.txt b/lite/utils/cv/CMakeLists.txt index 153487623bd3539505543ba1bdc155f77f6c22c9..f07350a4720d7f7eaa268fcaaddf8de31357725d 100644 --- a/lite/utils/cv/CMakeLists.txt +++ b/lite/utils/cv/CMakeLists.txt @@ -1,6 +1,7 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) lite_cc_library(paddle_cv_arm SRCS image_convert.cc + bgr_rotate.cc paddle_image_preprocess.cc image2tensor.cc image_flip.cc diff --git a/lite/utils/cv/bgr_rotate.cc b/lite/utils/cv/bgr_rotate.cc new file mode 100644 index 0000000000000000000000000000000000000000..93d280b89de8b729af3ed2b1c86d6b2c7e8771c8 --- /dev/null +++ b/lite/utils/cv/bgr_rotate.cc @@ -0,0 +1,1507 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// ncnn license +// Tencent is pleased to support the open source community by making ncnn +// available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this +// file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "lite/utils/cv/bgr_rotate.h" +#include +#include +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in); + +void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in); + +void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in); + +void bgr_rotate_hwc( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle) { + if (angle == 90) { + rotate90_hwc(src, dst, w_in, h_in); + } + if (angle == 270) { + rotate270_hwc(src, dst, w_in, h_in); + } + if (angle == 180) { + rotate180_hwc(src, dst, w_in, h_in); + } +} + +/* +bgr1 bgr2 bgr3 +bgr4 bgr5 bgr6 +bgr7 bgr8 bgr9 +rotate: +bgr7 bgr4 bgr1 +bgr8 bgr5 bgr2 +bgr9 bgr6 bgr3 +*/ +#ifdef __aarch64__ +void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int w_out = h_in; + int h_out = w_in; + int win = w_in * 3; + int wout = w_out * 3; + int64_t stride_h = 4 * win; + int64_t stride_h_w = 4 * win - 24; + int ww = w_out - 8; + [w_out * h_out * 3]; + // block 8*8. 
-- 8*8 + int i = 0; + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + int j = 0; + for (; j < w_in - 7; j += 8) { + uint8_t* outptr0 = dst + j * wout + (ww - i) * 3; + uint8_t* outptr1 = outptr0 + wout; + uint8_t* outptr2 = outptr1 + wout; + uint8_t* outptr3 = outptr2 + wout; + uint8_t* outptr4 = outptr3 + wout; + uint8_t* outptr5 = outptr4 + wout; + uint8_t* outptr6 = outptr5 + wout; + uint8_t* outptr7 = outptr6 + wout; + asm volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03, + // 04, 05, 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13, + // 14, 15, 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23, + // 24, 25, 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32, + // 33, 34, 35, 36, + // 37}" + + "add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in + "add %[inptr1], %[inptr1], %[stride_h] \n" // 5 + "add %[inptr2], %[inptr2], %[stride_h] \n" // 6 + "add %[inptr3], %[inptr3], %[stride_h] \n" // 7 + + // b + "trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // g + "trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 
22 32 24 34 + // 26 36 } + + "trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // r + "trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // b1 + "trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // g1 + "trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // r1 + "trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + "ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02, + // 03, 04, 05, 06, + // 07}" + "ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12, + // 13, 14, 15, 16, + // 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23, + // 24, 25, 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32, + // 33, 34, 35, 36, + // 37}" + + "sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8 + "sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5 + "sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6 + "sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7 + + // b2 + "trn1 v18.8b, v12.8b, 
v15.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + // g2 + "trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + // r2 + "trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + + "trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + "trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + "trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // b2 + "trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + // g2 + "trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + // r2 + "trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + + // bgr + "trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + "trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + "trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + // bgr + "trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + + // bgr + "trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + "trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + "trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // b1 b2 + "trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50 + // 60 70} b + "trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50 + // 60 70} g + "trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 
50 + // 60 70} r + + // b1 b2 + "trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54 + // 64 74} b + "trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54 + // 64 74} g + "trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54 + // 64 74} r + + // b1 b2 + "trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50 + // 60 70} b + "trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50 + // 60 70} g + "trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50 + // 60 70} r + + "rev64 v12.8b, v12.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v13.8b, v13.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v14.8b, v14.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + "rev64 v15.8b, v15.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v16.8b, v16.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v17.8b, v17.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + // b1 b2 + "trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50 + // 60 70} b + "trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50 + // 60 70} g + "trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50 + // 60 70} r + + // "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" + // //00 10 20 30 04 14 24 34 + // "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" + // //02 12 22 32 + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34 + "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32 + // b1 b2 + "trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50 + // 60 70} b + "trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50 + // 60 70} g + "trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50 + // 60 70} r + + "trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50 + // 60 70} b + "trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50 + // 60 70} g + "trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50 + // 60 70} r + + // b1 b2 + "trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50 + // 60 70} b + 
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50 + // 60 70} g + "trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50 + // 60 70} r + + "trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50 + // 60 70} b + "trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50 + // 60 70} g + "trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50 + // 60 70} r + + "rev64 v6.8b, v6.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 b + "rev64 v7.8b, v7.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 g + "rev64 v8.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 r + + "rev64 v24.8b, v24.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v25.8b, v25.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v26.8b, v26.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v9.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 b + "rev64 v10.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v11.8b, v11.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v27.8b, v27.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v28.8b, v28.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v29.8b, v29.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v0.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 b + "rev64 v1.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 g + "rev64 v2.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 r + + "rev64 v18.8b, v18.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v19.8b, v19.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v20.8b, v20.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32 + + "st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" 
// 02 12 22 32 + + "st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32 + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), + [outptr5] "+r"(outptr5), + [outptr6] "+r"(outptr6), + [outptr7] "+r"(outptr7), + [stride_h] "+r"(stride_h), + [stride_h_w] "+r"(stride_h_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25", + "v26", + "v27", + "v28", + "v29", + "v30"); + } + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + for (; j < w_in; j++) { + int tmpx = (ww - i) * 3; + uint8_t* outptr = dst + j * wout + tmpx; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + j * wout + (w_out - 1 - i) * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} +#else 
+void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int w_out = h_in; + int h_out = w_in; + int win = w_in * 3; + int wout = w_out * 3; + int hremain = h_in % 8; + int stride_h = 4 * win; + int stride_h_w = 4 * win - 24; + int ww = w_out - 8; + // block 8*8. -- 8*8 + int i = 0; + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + asm volatile( + "pld [%[ptr0]] @ preload a, 64byte\n" + "pld [%[ptr0], #64] @ preload a, 64byte\n" + "pld [%[ptr1]] @ preload a, 64byte\n" + "pld [%[ptr1], #64] @ preload a, 64byte\n" + "pld [%[ptr2]] @ preload a, 64byte\n" + "pld [%[ptr2], #64] @ preload a, 64byte\n" + "pld [%[ptr3]] @ preload a, 64byte\n" + "pld [%[ptr3], #64] @ preload a, 64byte\n" + "pld [%[ptr4]] @ preload a, 64byte\n" + "pld [%[ptr4], #64] @ preload a, 64byte\n" + "pld [%[ptr5]] @ preload a, 64byte\n" + "pld [%[ptr5], #64] @ preload a, 64byte\n" + "pld [%[ptr6]] @ preload a, 64byte\n" + "pld [%[ptr6], #64] @ preload a, 64byte\n" + "pld [%[ptr7]] @ preload a, 64byte\n" + "pld [%[ptr7], #64] @ preload a, 64byte\n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3), + [ptr4] "r"(inptr4), + [ptr5] "r"(inptr5), + [ptr6] "r"(inptr6), + [ptr7] "r"(inptr7) + : "memory"); + int j = 0; + for (; j < w_in; j++) { + int tmpx = (ww - i) * 3; + uint8_t* outptr = dst + j * wout + tmpx; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr3++; + 
*outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + } + } + ww = w_out - 1; + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + j * wout + (ww - i) * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} +#endif +/* +bgr1 bgr2 bgr3 +bgr4 bgr5 bgr6 +bgr7 bgr8 bgr9 +rotate: +bgr3 bgr6 bgr9 +bgr2 bgr5 bgr8 +bgr1 bgr4 bgr7 +*/ +// dst = (h_out - 1) * w_out +// 类似rotate90,将输出结果倒着输出 或者先rotate90,然后沿Y轴翻转 +#ifdef __aarch64__ +void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int w_out = h_in; + int h_out = w_in; + int win = w_in * 3; + int wout = w_out * 3; + int64_t stride_h = 4 * win; + int64_t stride_h_w = 4 * win - 24; + int hout = h_out - 1; + // block 8*8. 
-- 8*8 + int i = 0; + for (; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + int j = 0; + for (; j < w_in - 7; j += 8) { + uint8_t* outptr0 = dst + (hout - j) * wout + i * 3; + uint8_t* outptr1 = outptr0 - wout; + uint8_t* outptr2 = outptr1 - wout; + uint8_t* outptr3 = outptr2 - wout; + uint8_t* outptr4 = outptr3 - wout; + uint8_t* outptr5 = outptr4 - wout; + uint8_t* outptr6 = outptr5 - wout; + uint8_t* outptr7 = outptr6 - wout; + asm volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03, + // 04, 05, 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13, + // 14, 15, 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23, + // 24, 25, 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32, + // 33, 34, 35, 36, + // 37}" + + "add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in + "add %[inptr1], %[inptr1], %[stride_h] \n" // 5 + "add %[inptr2], %[inptr2], %[stride_h] \n" // 6 + "add %[inptr3], %[inptr3], %[stride_h] \n" // 7 + + // b + "trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // g + "trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 
32 24 34 + // 26 36 } + + "trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // r + "trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // b1 + "trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // g1 + "trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // r1 + "trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + "ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02, + // 03, 04, 05, 06, + // 07}" + "ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12, + // 13, 14, 15, 16, + // 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23, + // 24, 25, 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32, + // 33, 34, 35, 36, + // 37}" + + "sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8 + "sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5 + "sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6 + "sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7 + + // b2 + "trn1 v18.8b, v12.8b, 
v15.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + // g2 + "trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + // r2 + "trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + + "trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + "trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + "trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // b2 + "trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + // g2 + "trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + // r2 + "trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + + // bgr + "trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + "trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + "trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + // bgr + "trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + + // bgr + "trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + "trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + "trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // b1 b2 + "trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50 + // 60 70} b + "trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50 + // 60 70} g + "trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 
50 + // 60 70} r + + // b1 b2 + "trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54 + // 64 74} b + "trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54 + // 64 74} g + "trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54 + // 64 74} r + + // b1 b2 + "trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50 + // 60 70} b + "trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50 + // 60 70} g + "trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50 + // 60 70} r + + // b1 b2 + "trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50 + // 60 70} b + "trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50 + // 60 70} g + "trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50 + // 60 70} r + + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34 + "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32 + // b1 b2 + "trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50 + // 60 70} b + "trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50 + // 60 70} g + "trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50 + // 60 70} r + + "trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50 + // 60 70} b + "trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50 + // 60 70} g + "trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50 + // 60 70} r + + // b1 b2 + "trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50 + // 60 70} b + "trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50 + // 60 70} g + "trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50 + // 60 70} r + + "trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50 + // 60 70} b + "trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50 + // 60 70} g + "trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50 + // 60 70} r + + "st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32 + + "st3 {v9.8b, v10.8b, v11.8b}, 
[%[outptr2]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32 + + "st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32 + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), + [outptr5] "+r"(outptr5), + [outptr6] "+r"(outptr6), + [outptr7] "+r"(outptr7), + [stride_h] "+r"(stride_h), + [stride_h_w] "+r"(stride_h_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25", + "v26", + "v27", + "v28", + "v29"); + } + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + for (; j < w_in; j++) { + int tmpx = i * 3; + uint8_t* outptr = dst + (hout - j) * wout + tmpx; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst 
+ (hout - j) * wout + i * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} +#else +void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int w_out = h_in; + int h_out = w_in; + int win = w_in * 3; + int wout = w_out * 3; + int hremain = h_in % 8; + int stride_h = 4 * win; + int stride_h_w = 4 * win - 24; + int hout = h_out - 1; + // block 8*8. -- 8*8 + int i = 0; + for (; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + asm volatile( + "pld [%[ptr0]] @ preload a, 64byte\n" + "pld [%[ptr0], #64] @ preload a, 64byte\n" + "pld [%[ptr1]] @ preload a, 64byte\n" + "pld [%[ptr1], #64] @ preload a, 64byte\n" + "pld [%[ptr2]] @ preload a, 64byte\n" + "pld [%[ptr2], #64] @ preload a, 64byte\n" + "pld [%[ptr3]] @ preload a, 64byte\n" + "pld [%[ptr3], #64] @ preload a, 64byte\n" + "pld [%[ptr4]] @ preload a, 64byte\n" + "pld [%[ptr4], #64] @ preload a, 64byte\n" + "pld [%[ptr5]] @ preload a, 64byte\n" + "pld [%[ptr5], #64] @ preload a, 64byte\n" + "pld [%[ptr6]] @ preload a, 64byte\n" + "pld [%[ptr6], #64] @ preload a, 64byte\n" + "pld [%[ptr7]] @ preload a, 64byte\n" + "pld [%[ptr7], #64] @ preload a, 64byte\n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3), + [ptr4] "r"(inptr4), + [ptr5] "r"(inptr5), + [ptr6] "r"(inptr6), + [ptr7] "r"(inptr7) + : "memory"); + int j = 0; + for (; j < w_in; j++) { + int tmpx = i * 3; + uint8_t* outptr = dst + (hout - j) * wout + tmpx; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + 
*outptr++ = *inptr2++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + (hout - j) * wout + i * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} +#endif +/* +bgr1 bgr2 bgr3 +bgr4 bgr5 bgr6 +bgr7 bgr8 bgr9 +rotate: +bgr9 bgr8 bgr7 +bgr6 bgr5 bgr4 +bgr3 bgr2 bgr1 +*/ +// filp y +#ifdef __aarch64__ +void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) { + int w_in = w * 3; + uint8_t zerobuff[30000]; // [w_in]; + memset(zerobuff, 0, w_in * sizeof(uint8_t)); + int64_t stride_w = 24; + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + + uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col + uint8_t* outptr1 = outptr0 - w_in; + uint8_t* outptr2 = outptr1 - w_in; + uint8_t* outptr3 = outptr2 - w_in; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr3]] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + int j = 0; + for (; j < w - 7; j += 8) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff; + default: + break; + } + } + asm 
volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02, + // 03, 04, 05, + // 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12, + // 13, 14, 15, + // 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22, + // 23, 24, 25, + // 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32, + // 33, 34, 35, + // 36, 37}" + + "rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr2]] \n" + "prfm pldl1keep, [%[inptr3]] \n" + + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10 + // 20 30 + // 04 14 + // 24 34 + "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12 + // 22 32 + "st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11 + // 21 31 + "st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13 + // 23 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), 
+ [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23"); + } + outptr3 += stride_w - 3; + outptr2 += stride_w - 3; + outptr1 += stride_w - 3; + outptr0 += stride_w - 3; + for (; j < w; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 6; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + } + } + } + delete[] zerobuff; +} +#else +void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) { + int w_in = w * 3; + uint8_t zerobuff[30000]; // w_in + memset(zerobuff, 0, w_in * sizeof(uint8_t)); + int stride_w = 24; + // 4*8 + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + + uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last + uint8_t* outptr1 = outptr0 - w_in; + uint8_t* outptr2 = outptr1 - w_in; + uint8_t* outptr3 = outptr2 - w_in; + asm volatile( + "pld [%[ptr0]] @ preload a, 64byte\n" + "pld [%[ptr1]] 
@ preload a, 64byte\n" + "pld [%[ptr2]] @ preload a, 64byte\n" + "pld [%[ptr3]] @ preload a, 64byte\n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + int j = 0; + for (; j < w - 7; j += 8) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff; + default: + break; + } + } + asm volatile( + "vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 " + "04 05 06 07\n" + "vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 " + "14 15 16 17\n" + "vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 " + "24 25 26 27\n" + "vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 " + "33 34 35 36 37\n" + + "vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr1]] @ preload a, 64byte\n" + "pld [%[inptr2]] @ preload a, 64byte\n" + "pld [%[inptr3]] @ preload a, 64byte\n" + + "vst3.8 {d12, d13, d14}, [%[outptr0]] @ write " + "d0(q0,low),r00,r10 20 30\n" + "vst3.8 {d15, d16, d17}, [%[outptr1]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst3.8 {d18, d19, d20}, 
[%[outptr2]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst3.8 {d21, d22, d23}, [%[outptr3]] @ write " + "d4(q0,low),r01,r11 21 31\n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12"); + } + outptr3 += stride_w - 3; + outptr2 += stride_w - 3; + outptr1 += stride_w - 3; + outptr0 += stride_w - 3; + for (; j < w; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 6; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + } + } + } + delete[] zerobuff; +} +#endif +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/bgr_rotate.h b/lite/utils/cv/bgr_rotate.h new file mode 100644 index 0000000000000000000000000000000000000000..bb85da56955154863eb17595ebb5b58d79cd6a83 --- /dev/null +++ 
b/lite/utils/cv/bgr_rotate.h @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void bgr_rotate_hwc( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle); +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc index 2fc884a0fa4fd359c150115a20bdf751094b4687..f4a80ed6255186b8c1b59a8d56fd64b78c9bc1d2 100644 --- a/lite/utils/cv/image_resize.cc +++ b/lite/utils/cv/image_resize.cc @@ -664,15 +664,6 @@ void resize(const uint8_t* src, memcpy(dst, src, sizeof(uint8_t) * size); return; } - double scale_x = static_cast(srcw) / dstw; - double scale_y = static_cast(srch) / dsth; - - int* buf = new int[dstw * 2 + dsth * 3]; - - int* xofs = buf; - int* yofs = buf + dstw; - int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); - int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); int w_out = dstw; int w_in = srcw; @@ -692,12 +683,19 @@ void resize(const uint8_t* src, w_in = srcw * 3; w_out = dstw * 3; num = 3; - } else if (srcFormat == BGRA || srcFormat == RGBA) { w_in = srcw * 4; w_out = dstw * 4; num = 4; } + double scale_x = static_cast(srcw) / dstw; + double scale_y = static_cast(srch) / dsth; + + int* buf = new int[dstw * 2 + dsth * 3]; + int* xofs = buf; + int* 
yofs = buf + dstw; + int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); + int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); compute_xy( srcw, srch, dstw, orih, num, scale_x, scale_y, xofs, yofs, ialpha, ibeta); @@ -726,10 +724,10 @@ void resize(const uint8_t* src, int remain = w_out % 8; int32x4_t _v2 = vdupq_n_s32(2); int prev_sy1 = -1; + int16_t* rowsbuf0 = new int16_t[w_out + 1]; + int16_t* rowsbuf1 = new int16_t[w_out + 1]; #pragma omp parallel for for (int dy = 0; dy < dsth; dy++) { - int16_t* rowsbuf0 = new int16_t[w_out + 1]; - int16_t* rowsbuf1 = new int16_t[w_out + 1]; int sy = yofs[dy]; if (dy >= orih) { xofs = xofs1; @@ -853,8 +851,6 @@ void resize(const uint8_t* src, 2); } ibeta += 2; - delete[] rowsbuf0; - delete[] rowsbuf1; } if (orih < dsth) { // uv delete[] xofs1; @@ -862,6 +858,8 @@ void resize(const uint8_t* src, delete[] ialpha1; } delete[] buf; + delete[] rowsbuf0; + delete[] rowsbuf1; } // compute xofs, yofs, alpha, beta void compute_xy(int srcw, diff --git a/lite/utils/cv/image_rotate.cc b/lite/utils/cv/image_rotate.cc index 4ef757793ec009f4a4807499b1c48ac908393966..c87fc4def24220e240168a7114910c7c9ecee5ba 100644 --- a/lite/utils/cv/image_rotate.cc +++ b/lite/utils/cv/image_rotate.cc @@ -15,6 +15,7 @@ #include "lite/utils/cv/image_rotate.h" #include #include +#include "lite/utils/cv/bgr_rotate.h" namespace paddle { namespace lite { namespace utils { @@ -31,7 +32,8 @@ void ImageRotate::choose(const uint8_t* src, if (srcFormat == GRAY) { rotate_hwc1(src, dst, srcw, srch, degree); } else if (srcFormat == BGR || srcFormat == RGB) { - rotate_hwc3(src, dst, srcw, srch, degree); + // rotate_hwc3(src, dst, srcw, srch, degree); + bgr_rotate_hwc(src, dst, srcw, srch, static_cast(degree)); } else if (srcFormat == BGRA || srcFormat == RGBA) { rotate_hwc4(src, dst, srcw, srch, degree); } else { diff --git a/lite/utils/logging.h b/lite/utils/logging.h index c2c999fd70f3eee78c1deaf5ec2c4fea4e4f3fd1..3d97f4dbec1e4973295248c94c4156563dfb4f5d 100644 
--- a/lite/utils/logging.h +++ b/lite/utils/logging.h @@ -29,6 +29,7 @@ #include #include #include "lite/utils/replace_stl/stream.h" +#include "lite/utils/string.h" #ifdef LITE_WITH_ANDROID #include @@ -171,7 +172,7 @@ class VLogMessage { if (GLOG_v_int < level_int) { return; } - const char* level = std::to_string(level_int).c_str(); + const char* level = paddle::lite::to_string(level_int).c_str(); paddle::lite::gen_log(log_stream_, file, func, lineno, level); } diff --git a/lite/utils/replace_stl/stream.cc b/lite/utils/replace_stl/stream.cc index e72f2717293d0cc07ac28c6d51dd4d2bb5ae7874..37b02d3c50b8ed78bb8335a1618f753f645fd00b 100644 --- a/lite/utils/replace_stl/stream.cc +++ b/lite/utils/replace_stl/stream.cc @@ -15,6 +15,7 @@ #include "lite/utils/replace_stl/stream.h" #include #include +#include "lite/utils/string.h" #ifdef LITE_ON_TINY_PUBLISH @@ -39,9 +40,9 @@ void ostream::pad(const std::string& text) { #ifdef LITE_SHUTDOWN_LOG #define ADD_DATA_AS_STRING(data_, obj_) #else -#define ADD_DATA_AS_STRING(data_, obj_) \ - std::string text = std::to_string(obj_); \ - pad(text); \ +#define ADD_DATA_AS_STRING(data_, obj_) \ + std::string text = paddle::lite::to_string(obj_); \ + pad(text); \ data_ = data_ + text; #endif diff --git a/lite/utils/string.h b/lite/utils/string.h index d96b2aac20549989afdc730e34af4fc40541329d..5269525b64f473f1018e183613c087886dba97d6 100644 --- a/lite/utils/string.h +++ b/lite/utils/string.h @@ -48,7 +48,14 @@ template static std::string to_string_with_precision(const T& v, const int n = 6) { STL::stringstream ss; ss.precision(n); - // ss << std::fixed << v; + ss << v; + return ss.str(); +} + +template +static std::string to_string(const T& v) { + STL::stringstream ss; + ss << v; return ss.str(); }